aboutsummaryrefslogtreecommitdiff
path: root/net/dccp
diff options
context:
space:
mode:
Diffstat (limited to 'net/dccp')
-rw-r--r-- net/dccp/Kconfig 48
-rw-r--r-- net/dccp/Makefile 30
-rw-r--r-- net/dccp/ackvec.c 657
-rw-r--r-- net/dccp/ackvec.h 205
-rw-r--r-- net/dccp/ccid.c 248
-rw-r--r-- net/dccp/ccid.h 246
-rw-r--r-- net/dccp/ccids/Kconfig 53
-rw-r--r-- net/dccp/ccids/Makefile 5
-rw-r--r-- net/dccp/ccids/ccid2.c 784
-rw-r--r-- net/dccp/ccids/ccid2.h 133
-rw-r--r-- net/dccp/ccids/ccid3.c 1582
-rw-r--r-- net/dccp/ccids/ccid3.h 182
-rw-r--r-- net/dccp/ccids/lib/Makefile 3
-rw-r--r-- net/dccp/ccids/lib/loss_interval.c 247
-rw-r--r-- net/dccp/ccids/lib/loss_interval.h 80
-rw-r--r-- net/dccp/ccids/lib/packet_history.c 649
-rw-r--r-- net/dccp/ccids/lib/packet_history.h 223
-rw-r--r-- net/dccp/ccids/lib/tfrc.c 45
-rw-r--r-- net/dccp/ccids/lib/tfrc.h 73
-rw-r--r-- net/dccp/ccids/lib/tfrc_equation.c 267
-rw-r--r-- net/dccp/dccp.h 592
-rw-r--r-- net/dccp/diag.c 31
-rw-r--r-- net/dccp/feat.c 1561
-rw-r--r-- net/dccp/feat.h 137
-rw-r--r-- net/dccp/input.c 579
-rw-r--r-- net/dccp/ipv4.c 1131
-rw-r--r-- net/dccp/ipv6.c 1297
-rw-r--r-- net/dccp/ipv6.h 3
-rw-r--r-- net/dccp/minisocks.c 181
-rw-r--r-- net/dccp/options.c 599
-rw-r--r-- net/dccp/output.c 606
-rw-r--r-- net/dccp/probe.c 203
-rw-r--r-- net/dccp/proto.c 919
-rw-r--r-- net/dccp/qpolicy.c 137
-rw-r--r-- net/dccp/sysctl.c 118
-rw-r--r-- net/dccp/timer.c 204
36 files changed, 8922 insertions, 5136 deletions
diff --git a/net/dccp/Kconfig b/net/dccp/Kconfig
index 187ac182e24..8c0ef71bed2 100644
--- a/net/dccp/Kconfig
+++ b/net/dccp/Kconfig
@@ -1,50 +1,62 @@
-menu "DCCP Configuration (EXPERIMENTAL)"
- depends on INET && EXPERIMENTAL
-
-config IP_DCCP
- tristate "The DCCP Protocol (EXPERIMENTAL)"
+menuconfig IP_DCCP
+ tristate "The DCCP Protocol"
+ depends on INET
---help---
- Datagram Congestion Control Protocol
+ Datagram Congestion Control Protocol (RFC 4340)
- From draft-ietf-dccp-spec-11 <http://www.icir.org/kohler/dcp/draft-ietf-dccp-spec-11.txt>.
+ From http://www.ietf.org/rfc/rfc4340.txt:
The Datagram Congestion Control Protocol (DCCP) is a transport
protocol that implements bidirectional, unicast connections of
congestion-controlled, unreliable datagrams. It should be suitable
for use by applications such as streaming media, Internet telephony,
- and on-line games
+ and on-line games.
To compile this protocol support as a module, choose M here: the
module will be called dccp.
If in doubt, say N.
+if IP_DCCP
+
config INET_DCCP_DIAG
- depends on IP_DCCP && INET_DIAG
+ depends on INET_DIAG
def_tristate y if (IP_DCCP = y && INET_DIAG = y)
def_tristate m
source "net/dccp/ccids/Kconfig"
menu "DCCP Kernel Hacking"
- depends on IP_DCCP && DEBUG_KERNEL=y
+ depends on DEBUG_KERNEL=y
config IP_DCCP_DEBUG
bool "DCCP debug messages"
---help---
Only use this if you're hacking DCCP.
+ When compiling DCCP as a module, this debugging output can be toggled
+ by setting the parameter dccp_debug of the `dccp' module to 0 or 1.
+
Just say N.
-config IP_DCCP_UNLOAD_HACK
- depends on IP_DCCP=m && IP_DCCP_CCID3=m
- bool "DCCP control sock unload hack"
+config NET_DCCPPROBE
+ tristate "DCCP connection probing"
+ depends on PROC_FS && KPROBES
---help---
- Enable this to be able to unload the dccp module when the it
- has only one refcount held, the control sock one. Just execute
- "rmmod dccp_ccid3 dccp"
+ This module allows for capturing the changes to DCCP connection
+ state in response to incoming packets. It is used for debugging
+ DCCP congestion avoidance modules. If you don't understand
+ what was just said, you don't need it: say N.
+
+ Documentation on how to use DCCP connection probing can be found
+ at:
+
+ http://www.linuxfoundation.org/collaborate/workgroups/networking/dccpprobe
+
+ To compile this code as a module, choose M here: the
+ module will be called dccp_probe.
- Just say N.
-endmenu
endmenu
+
+endif # IP_DCCP
diff --git a/net/dccp/Makefile b/net/dccp/Makefile
index 87b27fff6e3..5c8362b037e 100644
--- a/net/dccp/Makefile
+++ b/net/dccp/Makefile
@@ -1,16 +1,28 @@
-obj-$(CONFIG_IPV6) += dccp_ipv6.o
+obj-$(CONFIG_IP_DCCP) += dccp.o dccp_ipv4.o
-dccp_ipv6-y := ipv6.o
-
-obj-$(CONFIG_IP_DCCP) += dccp.o
+dccp-y := ccid.o feat.o input.o minisocks.o options.o output.o proto.o timer.o \
+ qpolicy.o
+#
+# CCID algorithms to be used by dccp.ko
+#
+# CCID-2 is default (RFC 4340, p. 77) and has Ack Vectors as dependency
+dccp-y += ccids/ccid2.o ackvec.o
+dccp-$(CONFIG_IP_DCCP_CCID3) += ccids/ccid3.o
+dccp-$(CONFIG_IP_DCCP_TFRC_LIB) += ccids/lib/tfrc.o \
+ ccids/lib/tfrc_equation.o \
+ ccids/lib/packet_history.o \
+ ccids/lib/loss_interval.o
-dccp-y := ccid.o input.o ipv4.o minisocks.o options.o output.o proto.o \
- timer.o
+dccp_ipv4-y := ipv4.o
-dccp-$(CONFIG_IP_DCCP_ACKVEC) += ackvec.o
+# build dccp_ipv6 as module whenever either IPv6 or DCCP is a module
+obj-$(subst y,$(CONFIG_IP_DCCP),$(CONFIG_IPV6)) += dccp_ipv6.o
+dccp_ipv6-y := ipv6.o
obj-$(CONFIG_INET_DCCP_DIAG) += dccp_diag.o
+obj-$(CONFIG_NET_DCCPPROBE) += dccp_probe.o
-dccp_diag-y := diag.o
+dccp-$(CONFIG_SYSCTL) += sysctl.o
-obj-y += ccids/
+dccp_diag-y := diag.o
+dccp_probe-y := probe.o
diff --git a/net/dccp/ackvec.c b/net/dccp/ackvec.c
index ce9cb77c5c2..ba07824af4c 100644
--- a/net/dccp/ackvec.c
+++ b/net/dccp/ackvec.c
@@ -1,426 +1,409 @@
/*
* net/dccp/ackvec.c
*
- * An implementation of the DCCP protocol
+ * An implementation of Ack Vectors for the DCCP protocol
+ * Copyright (c) 2007 University of Aberdeen, Scotland, UK
* Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@ghostprotocols.net>
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License as published by the
* Free Software Foundation; version 2 of the License;
*/
-
-#include "ackvec.h"
#include "dccp.h"
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/export.h>
-#include <linux/dccp.h>
-#include <linux/skbuff.h>
-
-#include <net/sock.h>
+static struct kmem_cache *dccp_ackvec_slab;
+static struct kmem_cache *dccp_ackvec_record_slab;
-int dccp_insert_option_ackvec(struct sock *sk, struct sk_buff *skb)
+struct dccp_ackvec *dccp_ackvec_alloc(const gfp_t priority)
{
- struct dccp_sock *dp = dccp_sk(sk);
- struct dccp_ackvec *av = dp->dccps_hc_rx_ackvec;
- int len = av->dccpav_vec_len + 2;
- struct timeval now;
- u32 elapsed_time;
- unsigned char *to, *from;
-
- dccp_timestamp(sk, &now);
- elapsed_time = timeval_delta(&now, &av->dccpav_time) / 10;
-
- if (elapsed_time != 0)
- dccp_insert_option_elapsed_time(sk, skb, elapsed_time);
+ struct dccp_ackvec *av = kmem_cache_zalloc(dccp_ackvec_slab, priority);
- if (DCCP_SKB_CB(skb)->dccpd_opt_len + len > DCCP_MAX_OPT_LEN)
- return -1;
-
- /*
- * XXX: now we have just one ack vector sent record, so
- * we have to wait for it to be cleared.
- *
- * Of course this is not acceptable, but this is just for
- * basic testing now.
- */
- if (av->dccpav_ack_seqno != DCCP_MAX_SEQNO + 1)
- return -1;
+ if (av != NULL) {
+ av->av_buf_head = av->av_buf_tail = DCCPAV_MAX_ACKVEC_LEN - 1;
+ INIT_LIST_HEAD(&av->av_records);
+ }
+ return av;
+}
- DCCP_SKB_CB(skb)->dccpd_opt_len += len;
+static void dccp_ackvec_purge_records(struct dccp_ackvec *av)
+{
+ struct dccp_ackvec_record *cur, *next;
- to = skb_push(skb, len);
- *to++ = DCCPO_ACK_VECTOR_0;
- *to++ = len;
+ list_for_each_entry_safe(cur, next, &av->av_records, avr_node)
+ kmem_cache_free(dccp_ackvec_record_slab, cur);
+ INIT_LIST_HEAD(&av->av_records);
+}
- len = av->dccpav_vec_len;
- from = av->dccpav_buf + av->dccpav_buf_head;
+void dccp_ackvec_free(struct dccp_ackvec *av)
+{
+ if (likely(av != NULL)) {
+ dccp_ackvec_purge_records(av);
+ kmem_cache_free(dccp_ackvec_slab, av);
+ }
+}
- /* Check if buf_head wraps */
- if ((int)av->dccpav_buf_head + len > av->dccpav_vec_len) {
- const u32 tailsize = av->dccpav_vec_len - av->dccpav_buf_head;
+/**
+ * dccp_ackvec_update_records - Record information about sent Ack Vectors
+ * @av: Ack Vector records to update
+ * @seqno: Sequence number of the packet carrying the Ack Vector just sent
+ * @nonce_sum: The sum of all buffer nonces contained in the Ack Vector
+ */
+int dccp_ackvec_update_records(struct dccp_ackvec *av, u64 seqno, u8 nonce_sum)
+{
+ struct dccp_ackvec_record *avr;
- memcpy(to, from, tailsize);
- to += tailsize;
- len -= tailsize;
- from = av->dccpav_buf;
- }
+ avr = kmem_cache_alloc(dccp_ackvec_record_slab, GFP_ATOMIC);
+ if (avr == NULL)
+ return -ENOBUFS;
- memcpy(to, from, len);
+ avr->avr_ack_seqno = seqno;
+ avr->avr_ack_ptr = av->av_buf_head;
+ avr->avr_ack_ackno = av->av_buf_ackno;
+ avr->avr_ack_nonce = nonce_sum;
+ avr->avr_ack_runlen = dccp_ackvec_runlen(av->av_buf + av->av_buf_head);
/*
- * From draft-ietf-dccp-spec-11.txt:
- *
- * For each acknowledgement it sends, the HC-Receiver will add an
- * acknowledgement record. ack_seqno will equal the HC-Receiver
- * sequence number it used for the ack packet; ack_ptr will equal
- * buf_head; ack_ackno will equal buf_ackno; and ack_nonce will
- * equal buf_nonce.
- *
- * This implemention uses just one ack record for now.
+ * When the buffer overflows, we keep no more than one record. This is
+ * the simplest way of disambiguating sender-Acks dating from before the
+ * overflow from sender-Acks which refer to after the overflow; a simple
+ * solution is preferable here since we are handling an exception.
*/
- av->dccpav_ack_seqno = DCCP_SKB_CB(skb)->dccpd_seq;
- av->dccpav_ack_ptr = av->dccpav_buf_head;
- av->dccpav_ack_ackno = av->dccpav_buf_ackno;
- av->dccpav_ack_nonce = av->dccpav_buf_nonce;
- av->dccpav_sent_len = av->dccpav_vec_len;
-
- dccp_pr_debug("%sACK Vector 0, len=%d, ack_seqno=%llu, "
- "ack_ackno=%llu\n",
- debug_prefix, av->dccpav_sent_len,
- (unsigned long long)av->dccpav_ack_seqno,
- (unsigned long long)av->dccpav_ack_ackno);
- return -1;
+ if (av->av_overflow)
+ dccp_ackvec_purge_records(av);
+ /*
+ * Since GSS is incremented for each packet, the list is automatically
+ * arranged in descending order of @ack_seqno.
+ */
+ list_add(&avr->avr_node, &av->av_records);
+
+ dccp_pr_debug("Added Vector, ack_seqno=%llu, ack_ackno=%llu (rl=%u)\n",
+ (unsigned long long)avr->avr_ack_seqno,
+ (unsigned long long)avr->avr_ack_ackno,
+ avr->avr_ack_runlen);
+ return 0;
}
-struct dccp_ackvec *dccp_ackvec_alloc(const unsigned int len,
- const gfp_t priority)
+static struct dccp_ackvec_record *dccp_ackvec_lookup(struct list_head *av_list,
+ const u64 ackno)
{
- struct dccp_ackvec *av;
-
- BUG_ON(len == 0);
-
- if (len > DCCP_MAX_ACKVEC_LEN)
- return NULL;
-
- av = kmalloc(sizeof(*av) + len, priority);
- if (av != NULL) {
- av->dccpav_buf_len = len;
- av->dccpav_buf_head =
- av->dccpav_buf_tail = av->dccpav_buf_len - 1;
- av->dccpav_buf_ackno =
- av->dccpav_ack_ackno = av->dccpav_ack_seqno = ~0LLU;
- av->dccpav_buf_nonce = av->dccpav_buf_nonce = 0;
- av->dccpav_ack_ptr = 0;
- av->dccpav_time.tv_sec = 0;
- av->dccpav_time.tv_usec = 0;
- av->dccpav_sent_len = av->dccpav_vec_len = 0;
+ struct dccp_ackvec_record *avr;
+ /*
+ * Exploit that records are inserted in descending order of sequence
+ * number, start with the oldest record first. If @ackno is `before'
+ * the earliest ack_seqno, the packet is too old to be considered.
+ */
+ list_for_each_entry_reverse(avr, av_list, avr_node) {
+ if (avr->avr_ack_seqno == ackno)
+ return avr;
+ if (before48(ackno, avr->avr_ack_seqno))
+ break;
}
-
- return av;
+ return NULL;
}
-void dccp_ackvec_free(struct dccp_ackvec *av)
+/*
+ * Buffer index and length computation using modulo-buffersize arithmetic.
+ * Note that, as pointers move from right to left, head is `before' tail.
+ */
+static inline u16 __ackvec_idx_add(const u16 a, const u16 b)
{
- kfree(av);
+ return (a + b) % DCCPAV_MAX_ACKVEC_LEN;
}
-static inline u8 dccp_ackvec_state(const struct dccp_ackvec *av,
- const u8 index)
+static inline u16 __ackvec_idx_sub(const u16 a, const u16 b)
{
- return av->dccpav_buf[index] & DCCP_ACKVEC_STATE_MASK;
+ return __ackvec_idx_add(a, DCCPAV_MAX_ACKVEC_LEN - b);
}
-static inline u8 dccp_ackvec_len(const struct dccp_ackvec *av,
- const u8 index)
+u16 dccp_ackvec_buflen(const struct dccp_ackvec *av)
{
- return av->dccpav_buf[index] & DCCP_ACKVEC_LEN_MASK;
+ if (unlikely(av->av_overflow))
+ return DCCPAV_MAX_ACKVEC_LEN;
+ return __ackvec_idx_sub(av->av_buf_tail, av->av_buf_head);
}
-/*
- * If several packets are missing, the HC-Receiver may prefer to enter multiple
- * bytes with run length 0, rather than a single byte with a larger run length;
- * this simplifies table updates if one of the missing packets arrives.
+/**
+ * dccp_ackvec_update_old - Update previous state as per RFC 4340, 11.4.1
+ * @av: non-empty buffer to update
+ * @distance: negative or zero distance of @seqno from buf_ackno downward
+ * @seqno: the (old) sequence number whose record is to be updated
+ * @state: state in which packet carrying @seqno was received
*/
-static inline int dccp_ackvec_set_buf_head_state(struct dccp_ackvec *av,
- const unsigned int packets,
- const unsigned char state)
+static void dccp_ackvec_update_old(struct dccp_ackvec *av, s64 distance,
+ u64 seqno, enum dccp_ackvec_states state)
{
- unsigned int gap;
- signed long new_head;
+ u16 ptr = av->av_buf_head;
- if (av->dccpav_vec_len + packets > av->dccpav_buf_len)
- return -ENOBUFS;
+ BUG_ON(distance > 0);
+ if (unlikely(dccp_ackvec_is_empty(av)))
+ return;
- gap = packets - 1;
- new_head = av->dccpav_buf_head - packets;
+ do {
+ u8 runlen = dccp_ackvec_runlen(av->av_buf + ptr);
- if (new_head < 0) {
- if (gap > 0) {
- memset(av->dccpav_buf, DCCP_ACKVEC_STATE_NOT_RECEIVED,
- gap + new_head + 1);
- gap = -new_head;
+ if (distance + runlen >= 0) {
+ /*
+ * Only update the state if packet has not been received
+ * yet. This is OK as per the second table in RFC 4340,
+ * 11.4.1; i.e. here we are using the following table:
+ * RECEIVED
+ * 0 1 3
+ * S +---+---+---+
+ * T 0 | 0 | 0 | 0 |
+ * O +---+---+---+
+ * R 1 | 1 | 1 | 1 |
+ * E +---+---+---+
+ * D 3 | 0 | 1 | 3 |
+ * +---+---+---+
+ * The "Not Received" state was set by reserve_seats().
+ */
+ if (av->av_buf[ptr] == DCCPAV_NOT_RECEIVED)
+ av->av_buf[ptr] = state;
+ else
+ dccp_pr_debug("Not changing %llu state to %u\n",
+ (unsigned long long)seqno, state);
+ break;
}
- new_head += av->dccpav_buf_len;
- }
- av->dccpav_buf_head = new_head;
+ distance += runlen + 1;
+ ptr = __ackvec_idx_add(ptr, 1);
- if (gap > 0)
- memset(av->dccpav_buf + av->dccpav_buf_head + 1,
- DCCP_ACKVEC_STATE_NOT_RECEIVED, gap);
+ } while (ptr != av->av_buf_tail);
+}
- av->dccpav_buf[av->dccpav_buf_head] = state;
- av->dccpav_vec_len += packets;
- return 0;
+/* Mark @num entries after buf_head as "Not yet received". */
+static void dccp_ackvec_reserve_seats(struct dccp_ackvec *av, u16 num)
+{
+ u16 start = __ackvec_idx_add(av->av_buf_head, 1),
+ len = DCCPAV_MAX_ACKVEC_LEN - start;
+
+ /* check for buffer wrap-around */
+ if (num > len) {
+ memset(av->av_buf + start, DCCPAV_NOT_RECEIVED, len);
+ start = 0;
+ num -= len;
+ }
+ if (num)
+ memset(av->av_buf + start, DCCPAV_NOT_RECEIVED, num);
}
-/*
- * Implements the draft-ietf-dccp-spec-11.txt Appendix A
+/**
+ * dccp_ackvec_add_new - Record one or more new entries in Ack Vector buffer
+ * @av: container of buffer to update (can be empty or non-empty)
+ * @num_packets: number of packets to register (must be >= 1)
+ * @seqno: sequence number of the first packet in @num_packets
+ * @state: state in which packet carrying @seqno was received
*/
-int dccp_ackvec_add(struct dccp_ackvec *av, const struct sock *sk,
- const u64 ackno, const u8 state)
+static void dccp_ackvec_add_new(struct dccp_ackvec *av, u32 num_packets,
+ u64 seqno, enum dccp_ackvec_states state)
{
- /*
- * Check at the right places if the buffer is full, if it is, tell the
- * caller to start dropping packets till the HC-Sender acks our ACK
- * vectors, when we will free up space in dccpav_buf.
- *
- * We may well decide to do buffer compression, etc, but for now lets
- * just drop.
- *
- * From Appendix A:
- *
- * Of course, the circular buffer may overflow, either when the
- * HC-Sender is sending data at a very high rate, when the
- * HC-Receiver's acknowledgements are not reaching the HC-Sender,
- * or when the HC-Sender is forgetting to acknowledge those acks
- * (so the HC-Receiver is unable to clean up old state). In this
- * case, the HC-Receiver should either compress the buffer (by
- * increasing run lengths when possible), transfer its state to
- * a larger buffer, or, as a last resort, drop all received
- * packets, without processing them whatsoever, until its buffer
- * shrinks again.
- */
+ u32 num_cells = num_packets;
- /* See if this is the first ackno being inserted */
- if (av->dccpav_vec_len == 0) {
- av->dccpav_buf[av->dccpav_buf_head] = state;
- av->dccpav_vec_len = 1;
- } else if (after48(ackno, av->dccpav_buf_ackno)) {
- const u64 delta = dccp_delta_seqno(av->dccpav_buf_ackno,
- ackno);
+ if (num_packets > DCCPAV_BURST_THRESH) {
+ u32 lost_packets = num_packets - 1;
+ DCCP_WARN("Warning: large burst loss (%u)\n", lost_packets);
/*
- * Look if the state of this packet is the same as the
- * previous ackno and if so if we can bump the head len.
- */
- if (delta == 1 &&
- dccp_ackvec_state(av, av->dccpav_buf_head) == state &&
- (dccp_ackvec_len(av, av->dccpav_buf_head) <
- DCCP_ACKVEC_LEN_MASK))
- av->dccpav_buf[av->dccpav_buf_head]++;
- else if (dccp_ackvec_set_buf_head_state(av, delta, state))
- return -ENOBUFS;
- } else {
- /*
- * A.1.2. Old Packets
- *
- * When a packet with Sequence Number S arrives, and
- * S <= buf_ackno, the HC-Receiver will scan the table
- * for the byte corresponding to S. (Indexing structures
- * could reduce the complexity of this scan.)
+ * We received 1 packet and have a loss of size "num_packets-1"
+ * which we squeeze into num_cells-1 rather than reserving an
+ * entire byte for each lost packet.
+ * The reason is that the vector grows in O(burst_length); when
+ * it grows too large there will be no room left for the payload.
+ * This is a trade-off: if a few packets out of the burst show
+ * up later, their state will not be changed; it is simply too
+ * costly to reshuffle/reallocate/copy the buffer each time.
+ * Should such problems persist, we will need to switch to a
+ * different underlying data structure.
*/
- u64 delta = dccp_delta_seqno(ackno, av->dccpav_buf_ackno);
- u8 index = av->dccpav_buf_head;
+ for (num_packets = num_cells = 1; lost_packets; ++num_cells) {
+ u8 len = min(lost_packets, (u32)DCCPAV_MAX_RUNLEN);
- while (1) {
- const u8 len = dccp_ackvec_len(av, index);
- const u8 state = dccp_ackvec_state(av, index);
- /*
- * valid packets not yet in dccpav_buf have a reserved
- * entry, with a len equal to 0.
- */
- if (state == DCCP_ACKVEC_STATE_NOT_RECEIVED &&
- len == 0 && delta == 0) { /* Found our
- reserved seat! */
- dccp_pr_debug("Found %llu reserved seat!\n",
- (unsigned long long)ackno);
- av->dccpav_buf[index] = state;
- goto out;
- }
- /* len == 0 means one packet */
- if (delta < len + 1)
- goto out_duplicate;
-
- delta -= len + 1;
- if (++index == av->dccpav_buf_len)
- index = 0;
+ av->av_buf_head = __ackvec_idx_sub(av->av_buf_head, 1);
+ av->av_buf[av->av_buf_head] = DCCPAV_NOT_RECEIVED | len;
+
+ lost_packets -= len;
}
}
- av->dccpav_buf_ackno = ackno;
- dccp_timestamp(sk, &av->dccpav_time);
-out:
- dccp_pr_debug("");
- return 0;
+ if (num_cells + dccp_ackvec_buflen(av) >= DCCPAV_MAX_ACKVEC_LEN) {
+ DCCP_CRIT("Ack Vector buffer overflow: dropping old entries\n");
+ av->av_overflow = true;
+ }
+
+ av->av_buf_head = __ackvec_idx_sub(av->av_buf_head, num_packets);
+ if (av->av_overflow)
+ av->av_buf_tail = av->av_buf_head;
-out_duplicate:
- /* Duplicate packet */
- dccp_pr_debug("Received a dup or already considered lost "
- "packet: %llu\n", (unsigned long long)ackno);
- return -EILSEQ;
+ av->av_buf[av->av_buf_head] = state;
+ av->av_buf_ackno = seqno;
+
+ if (num_packets > 1)
+ dccp_ackvec_reserve_seats(av, num_packets - 1);
}
-#ifdef CONFIG_IP_DCCP_DEBUG
-void dccp_ackvector_print(const u64 ackno, const unsigned char *vector, int len)
+/**
+ * dccp_ackvec_input - Register incoming packet in the buffer
+ */
+void dccp_ackvec_input(struct dccp_ackvec *av, struct sk_buff *skb)
{
- if (!dccp_debug)
- return;
+ u64 seqno = DCCP_SKB_CB(skb)->dccpd_seq;
+ enum dccp_ackvec_states state = DCCPAV_RECEIVED;
- printk("ACK vector len=%d, ackno=%llu |", len,
- (unsigned long long)ackno);
+ if (dccp_ackvec_is_empty(av)) {
+ dccp_ackvec_add_new(av, 1, seqno, state);
+ av->av_tail_ackno = seqno;
- while (len--) {
- const u8 state = (*vector & DCCP_ACKVEC_STATE_MASK) >> 6;
- const u8 rl = *vector & DCCP_ACKVEC_LEN_MASK;
+ } else {
+ s64 num_packets = dccp_delta_seqno(av->av_buf_ackno, seqno);
+ u8 *current_head = av->av_buf + av->av_buf_head;
- printk("%d,%d|", state, rl);
- ++vector;
- }
+ if (num_packets == 1 &&
+ dccp_ackvec_state(current_head) == state &&
+ dccp_ackvec_runlen(current_head) < DCCPAV_MAX_RUNLEN) {
- printk("\n");
-}
+ *current_head += 1;
+ av->av_buf_ackno = seqno;
-void dccp_ackvec_print(const struct dccp_ackvec *av)
-{
- dccp_ackvector_print(av->dccpav_buf_ackno,
- av->dccpav_buf + av->dccpav_buf_head,
- av->dccpav_vec_len);
+ } else if (num_packets > 0) {
+ dccp_ackvec_add_new(av, num_packets, seqno, state);
+ } else {
+ dccp_ackvec_update_old(av, num_packets, seqno, state);
+ }
+ }
}
-#endif
-static void dccp_ackvec_throw_away_ack_record(struct dccp_ackvec *av)
+/**
+ * dccp_ackvec_clear_state - Perform house-keeping / garbage-collection
+ * This routine is called when the peer acknowledges the receipt of Ack Vectors
+ * up to and including @ackno. While based on section A.3 of RFC 4340, here
+ * are additional precautions to prevent corrupted buffer state. In particular,
+ * we use tail_ackno to identify outdated records; it always marks the earliest
+ * packet of group (2) in 11.4.2.
+ */
+void dccp_ackvec_clear_state(struct dccp_ackvec *av, const u64 ackno)
{
+ struct dccp_ackvec_record *avr, *next;
+ u8 runlen_now, eff_runlen;
+ s64 delta;
+
+ avr = dccp_ackvec_lookup(&av->av_records, ackno);
+ if (avr == NULL)
+ return;
/*
- * As we're keeping track of the ack vector size (dccpav_vec_len) and
- * the sent ack vector size (dccpav_sent_len) we don't need
- * dccpav_buf_tail at all, but keep this code here as in the future
- * we'll implement a vector of ack records, as suggested in
- * draft-ietf-dccp-spec-11.txt Appendix A. -acme
+ * Deal with outdated acknowledgments: this arises when e.g. there are
+ * several old records and the acks from the peer come in slowly. In
+ * that case we may still have records that pre-date tail_ackno.
*/
-#if 0
- u32 new_buf_tail = av->dccpav_ack_ptr + 1;
- if (new_buf_tail >= av->dccpav_vec_len)
- new_buf_tail -= av->dccpav_vec_len;
- av->dccpav_buf_tail = new_buf_tail;
-#endif
- av->dccpav_vec_len -= av->dccpav_sent_len;
-}
+ delta = dccp_delta_seqno(av->av_tail_ackno, avr->avr_ack_ackno);
+ if (delta < 0)
+ goto free_records;
+ /*
+ * Deal with overlapping Ack Vectors: don't subtract more than the
+ * number of packets between tail_ackno and ack_ackno.
+ */
+ eff_runlen = delta < avr->avr_ack_runlen ? delta : avr->avr_ack_runlen;
-void dccp_ackvec_check_rcv_ackno(struct dccp_ackvec *av, struct sock *sk,
- const u64 ackno)
-{
- /* Check if we actually sent an ACK vector */
- if (av->dccpav_ack_seqno == DCCP_MAX_SEQNO + 1)
- return;
+ runlen_now = dccp_ackvec_runlen(av->av_buf + avr->avr_ack_ptr);
+ /*
+ * The run length of Ack Vector cells does not decrease over time. If
+ * the run length is the same as at the time the Ack Vector was sent, we
+ * free the ack_ptr cell. That cell can however not be freed if the run
+ * length has increased: in this case we need to move the tail pointer
+ * backwards (towards higher indices), to its next-oldest neighbour.
+ */
+ if (runlen_now > eff_runlen) {
- if (ackno == av->dccpav_ack_seqno) {
-#ifdef CONFIG_IP_DCCP_DEBUG
- struct dccp_sock *dp = dccp_sk(sk);
- const char *debug_prefix = dp->dccps_role == DCCP_ROLE_CLIENT ?
- "CLIENT rx ack: " : "server rx ack: ";
-#endif
- dccp_pr_debug("%sACK packet 0, len=%d, ack_seqno=%llu, "
- "ack_ackno=%llu, ACKED!\n",
- debug_prefix, 1,
- (unsigned long long)av->dccpav_ack_seqno,
- (unsigned long long)av->dccpav_ack_ackno);
- dccp_ackvec_throw_away_ack_record(av);
- av->dccpav_ack_seqno = DCCP_MAX_SEQNO + 1;
- }
-}
+ av->av_buf[avr->avr_ack_ptr] -= eff_runlen + 1;
+ av->av_buf_tail = __ackvec_idx_add(avr->avr_ack_ptr, 1);
-static void dccp_ackvec_check_rcv_ackvector(struct dccp_ackvec *av,
- struct sock *sk, u64 ackno,
- const unsigned char len,
- const unsigned char *vector)
-{
- unsigned char i;
+ /* This move may not have cleared the overflow flag. */
+ if (av->av_overflow)
+ av->av_overflow = (av->av_buf_head == av->av_buf_tail);
+ } else {
+ av->av_buf_tail = avr->avr_ack_ptr;
+ /*
+ * We have made sure that avr points to a valid cell within the
+ * buffer. This cell is either older than head, or equals head
+ * (empty buffer): in both cases we no longer have any overflow.
+ */
+ av->av_overflow = 0;
+ }
- /* Check if we actually sent an ACK vector */
- if (av->dccpav_ack_seqno == DCCP_MAX_SEQNO + 1)
- return;
/*
- * We're in the receiver half connection, so if the received an ACK
- * vector ackno (e.g. 50) before dccpav_ack_seqno (e.g. 52), we're
- * not interested.
- *
- * Extra explanation with example:
- *
- * if we received an ACK vector with ackno 50, it can only be acking
- * 50, 49, 48, etc, not 52 (the seqno for the ACK vector we sent).
+ * The peer has acknowledged up to and including ack_ackno. Hence the
+ * first packet in group (2) of 11.4.2 is the successor of ack_ackno.
*/
- /* dccp_pr_debug("is %llu < %llu? ", ackno, av->dccpav_ack_seqno); */
- if (before48(ackno, av->dccpav_ack_seqno)) {
- /* dccp_pr_debug_cat("yes\n"); */
- return;
+ av->av_tail_ackno = ADD48(avr->avr_ack_ackno, 1);
+
+free_records:
+ list_for_each_entry_safe_from(avr, next, &av->av_records, avr_node) {
+ list_del(&avr->avr_node);
+ kmem_cache_free(dccp_ackvec_record_slab, avr);
}
- /* dccp_pr_debug_cat("no\n"); */
+}
- i = len;
- while (i--) {
- const u8 rl = *vector & DCCP_ACKVEC_LEN_MASK;
- u64 ackno_end_rl;
+/*
+ * Routines to keep track of Ack Vectors received in an skb
+ */
+int dccp_ackvec_parsed_add(struct list_head *head, u8 *vec, u8 len, u8 nonce)
+{
+ struct dccp_ackvec_parsed *new = kmalloc(sizeof(*new), GFP_ATOMIC);
- dccp_set_seqno(&ackno_end_rl, ackno - rl);
+ if (new == NULL)
+ return -ENOBUFS;
+ new->vec = vec;
+ new->len = len;
+ new->nonce = nonce;
- /*
- * dccp_pr_debug("is %llu <= %llu <= %llu? ", ackno_end_rl,
- * av->dccpav_ack_seqno, ackno);
- */
- if (between48(av->dccpav_ack_seqno, ackno_end_rl, ackno)) {
- const u8 state = (*vector &
- DCCP_ACKVEC_STATE_MASK) >> 6;
- /* dccp_pr_debug_cat("yes\n"); */
-
- if (state != DCCP_ACKVEC_STATE_NOT_RECEIVED) {
-#ifdef CONFIG_IP_DCCP_DEBUG
- struct dccp_sock *dp = dccp_sk(sk);
- const char *debug_prefix =
- dp->dccps_role == DCCP_ROLE_CLIENT ?
- "CLIENT rx ack: " : "server rx ack: ";
-#endif
- dccp_pr_debug("%sACK vector 0, len=%d, "
- "ack_seqno=%llu, ack_ackno=%llu, "
- "ACKED!\n",
- debug_prefix, len,
- (unsigned long long)
- av->dccpav_ack_seqno,
- (unsigned long long)
- av->dccpav_ack_ackno);
- dccp_ackvec_throw_away_ack_record(av);
- }
- /*
- * If dccpav_ack_seqno was not received, no problem
- * we'll send another ACK vector.
- */
- av->dccpav_ack_seqno = DCCP_MAX_SEQNO + 1;
- break;
- }
- /* dccp_pr_debug_cat("no\n"); */
+ list_add_tail(&new->node, head);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(dccp_ackvec_parsed_add);
- dccp_set_seqno(&ackno, ackno_end_rl - 1);
- ++vector;
- }
+void dccp_ackvec_parsed_cleanup(struct list_head *parsed_chunks)
+{
+ struct dccp_ackvec_parsed *cur, *next;
+
+ list_for_each_entry_safe(cur, next, parsed_chunks, node)
+ kfree(cur);
+ INIT_LIST_HEAD(parsed_chunks);
}
+EXPORT_SYMBOL_GPL(dccp_ackvec_parsed_cleanup);
-int dccp_ackvec_parse(struct sock *sk, const struct sk_buff *skb,
- const u8 opt, const u8 *value, const u8 len)
+int __init dccp_ackvec_init(void)
{
- if (len > DCCP_MAX_ACKVEC_LEN)
- return -1;
+ dccp_ackvec_slab = kmem_cache_create("dccp_ackvec",
+ sizeof(struct dccp_ackvec), 0,
+ SLAB_HWCACHE_ALIGN, NULL);
+ if (dccp_ackvec_slab == NULL)
+ goto out_err;
+
+ dccp_ackvec_record_slab = kmem_cache_create("dccp_ackvec_record",
+ sizeof(struct dccp_ackvec_record),
+ 0, SLAB_HWCACHE_ALIGN, NULL);
+ if (dccp_ackvec_record_slab == NULL)
+ goto out_destroy_slab;
- /* dccp_ackvector_print(DCCP_SKB_CB(skb)->dccpd_ack_seq, value, len); */
- dccp_ackvec_check_rcv_ackvector(dccp_sk(sk)->dccps_hc_rx_ackvec, sk,
- DCCP_SKB_CB(skb)->dccpd_ack_seq,
- len, value);
return 0;
+
+out_destroy_slab:
+ kmem_cache_destroy(dccp_ackvec_slab);
+ dccp_ackvec_slab = NULL;
+out_err:
+ DCCP_CRIT("Unable to create Ack Vector slab cache");
+ return -ENOBUFS;
+}
+
+void dccp_ackvec_exit(void)
+{
+ if (dccp_ackvec_slab != NULL) {
+ kmem_cache_destroy(dccp_ackvec_slab);
+ dccp_ackvec_slab = NULL;
+ }
+ if (dccp_ackvec_record_slab != NULL) {
+ kmem_cache_destroy(dccp_ackvec_record_slab);
+ dccp_ackvec_record_slab = NULL;
+ }
}
diff --git a/net/dccp/ackvec.h b/net/dccp/ackvec.h
index f7dfb5f67b8..3284bfa988c 100644
--- a/net/dccp/ackvec.h
+++ b/net/dccp/ackvec.h
@@ -3,131 +3,136 @@
/*
* net/dccp/ackvec.h
*
- * An implementation of the DCCP protocol
+ * An implementation of Ack Vectors for the DCCP protocol
+ * Copyright (c) 2007 University of Aberdeen, Scotland, UK
* Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@mandriva.com>
- *
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 as
* published by the Free Software Foundation.
*/
-#include <linux/config.h>
+#include <linux/dccp.h>
#include <linux/compiler.h>
-#include <linux/time.h>
+#include <linux/list.h>
#include <linux/types.h>
-/* Read about the ECN nonce to see why it is 253 */
-#define DCCP_MAX_ACKVEC_LEN 253
+/*
+ * Ack Vector buffer space is static, in multiples of %DCCP_SINGLE_OPT_MAXLEN,
+ * the maximum size of a single Ack Vector. Setting %DCCPAV_NUM_ACKVECS to 1
+ * will be sufficient for most cases of low Ack Ratios, using a value of 2 gives
+ * more headroom if Ack Ratio is higher or when the sender acknowledges slowly.
+ * The maximum value is bounded by the u16 types for indices and functions.
+ */
+#define DCCPAV_NUM_ACKVECS 2
+#define DCCPAV_MAX_ACKVEC_LEN (DCCP_SINGLE_OPT_MAXLEN * DCCPAV_NUM_ACKVECS)
+
+/* Estimated minimum average Ack Vector length - used for updating MPS */
+#define DCCPAV_MIN_OPTLEN 16
+
+/* Threshold for coping with large bursts of losses */
+#define DCCPAV_BURST_THRESH (DCCPAV_MAX_ACKVEC_LEN / 8)
-#define DCCP_ACKVEC_STATE_RECEIVED 0
-#define DCCP_ACKVEC_STATE_ECN_MARKED (1 << 6)
-#define DCCP_ACKVEC_STATE_NOT_RECEIVED (3 << 6)
+enum dccp_ackvec_states {
+ DCCPAV_RECEIVED = 0x00,
+ DCCPAV_ECN_MARKED = 0x40,
+ DCCPAV_RESERVED = 0x80,
+ DCCPAV_NOT_RECEIVED = 0xC0
+};
+#define DCCPAV_MAX_RUNLEN 0x3F
-#define DCCP_ACKVEC_STATE_MASK 0xC0 /* 11000000 */
-#define DCCP_ACKVEC_LEN_MASK 0x3F /* 00111111 */
+static inline u8 dccp_ackvec_runlen(const u8 *cell)
+{
+ return *cell & DCCPAV_MAX_RUNLEN;
+}
+
+static inline u8 dccp_ackvec_state(const u8 *cell)
+{
+ return *cell & ~DCCPAV_MAX_RUNLEN;
+}
-/** struct dccp_ackvec - ack vector
+/**
+ * struct dccp_ackvec - Ack Vector main data structure
*
- * This data structure is the one defined in the DCCP draft
- * Appendix A.
+ * This implements a fixed-size circular buffer within an array and is largely
+ * based on Appendix A of RFC 4340.
*
- * @dccpav_buf_head - circular buffer head
- * @dccpav_buf_tail - circular buffer tail
- * @dccpav_buf_ackno - ack # of the most recent packet acknowledgeable in the
- * buffer (i.e. %dccpav_buf_head)
- * @dccpav_buf_nonce - the one-bit sum of the ECN Nonces on all packets acked
- * by the buffer with State 0
+ * @av_buf: circular buffer storage area
+ * @av_buf_head: head index; begin of live portion in @av_buf
+ * @av_buf_tail: tail index; first index _after_ the live portion in @av_buf
+ * @av_buf_ackno: highest seqno of acknowledgeable packet recorded in @av_buf
+ * @av_tail_ackno: lowest seqno of acknowledgeable packet recorded in @av_buf
+ * @av_buf_nonce: ECN nonce sums, each covering subsequent segments of up to
+ * %DCCP_SINGLE_OPT_MAXLEN cells in the live portion of @av_buf
+ * @av_overflow: if 1 then buf_head == buf_tail indicates buffer wraparound
+ * @av_records: list of %dccp_ackvec_record (Ack Vectors sent previously)
+ */
+struct dccp_ackvec {
+ u8 av_buf[DCCPAV_MAX_ACKVEC_LEN];
+ u16 av_buf_head;
+ u16 av_buf_tail;
+ u64 av_buf_ackno:48;
+ u64 av_tail_ackno:48;
+ bool av_buf_nonce[DCCPAV_NUM_ACKVECS];
+ u8 av_overflow:1;
+ struct list_head av_records;
+};
+
+/**
+ * struct dccp_ackvec_record - Records information about sent Ack Vectors
*
- * Additionally, the HC-Receiver must keep some information about the
- * Ack Vectors it has recently sent. For each packet sent carrying an
- * Ack Vector, it remembers four variables:
+ * These list entries define the additional information which the HC-Receiver
+ * keeps about recently-sent Ack Vectors; again refer to RFC 4340, Appendix A.
*
- * @dccpav_ack_seqno - the Sequence Number used for the packet
- * (HC-Receiver seqno)
- * @dccpav_ack_ptr - the value of buf_head at the time of acknowledgement.
- * @dccpav_ack_ackno - the Acknowledgement Number used for the packet
- * (HC-Sender seqno)
- * @dccpav_ack_nonce - the one-bit sum of the ECN Nonces for all State 0.
+ * @avr_node: the list node in @av_records
+ * @avr_ack_seqno: sequence number of the packet the Ack Vector was sent on
+ * @avr_ack_ackno: the Ack number that this record/Ack Vector refers to
+ * @avr_ack_ptr: pointer into @av_buf where this record starts
+ * @avr_ack_runlen: run length of @avr_ack_ptr at the time of sending
+ * @avr_ack_nonce: the sum of @av_buf_nonce's at the time this record was sent
*
- * @dccpav_buf_len - circular buffer length
- * @dccpav_time - the time in usecs
- * @dccpav_buf - circular buffer of acknowledgeable packets
+ * The list as a whole is sorted in descending order by @avr_ack_seqno.
*/
-struct dccp_ackvec {
- u64 dccpav_buf_ackno;
- u64 dccpav_ack_seqno;
- u64 dccpav_ack_ackno;
- struct timeval dccpav_time;
- u8 dccpav_buf_head;
- u8 dccpav_buf_tail;
- u8 dccpav_ack_ptr;
- u8 dccpav_sent_len;
- u8 dccpav_vec_len;
- u8 dccpav_buf_len;
- u8 dccpav_buf_nonce;
- u8 dccpav_ack_nonce;
- u8 dccpav_buf[0];
+struct dccp_ackvec_record {
+ struct list_head avr_node;
+ u64 avr_ack_seqno:48;
+ u64 avr_ack_ackno:48;
+ u16 avr_ack_ptr;
+ u8 avr_ack_runlen;
+ u8 avr_ack_nonce:1;
};
-struct sock;
-struct sk_buff;
-
-#ifdef CONFIG_IP_DCCP_ACKVEC
-extern struct dccp_ackvec *dccp_ackvec_alloc(unsigned int len,
- const gfp_t priority);
-extern void dccp_ackvec_free(struct dccp_ackvec *av);
-
-extern int dccp_ackvec_add(struct dccp_ackvec *av, const struct sock *sk,
- const u64 ackno, const u8 state);
+int dccp_ackvec_init(void);
+void dccp_ackvec_exit(void);
-extern void dccp_ackvec_check_rcv_ackno(struct dccp_ackvec *av,
- struct sock *sk, const u64 ackno);
-extern int dccp_ackvec_parse(struct sock *sk, const struct sk_buff *skb,
- const u8 opt, const u8 *value, const u8 len);
+struct dccp_ackvec *dccp_ackvec_alloc(const gfp_t priority);
+void dccp_ackvec_free(struct dccp_ackvec *av);
-extern int dccp_insert_option_ackvec(struct sock *sk, struct sk_buff *skb);
+void dccp_ackvec_input(struct dccp_ackvec *av, struct sk_buff *skb);
+int dccp_ackvec_update_records(struct dccp_ackvec *av, u64 seq, u8 sum);
+void dccp_ackvec_clear_state(struct dccp_ackvec *av, const u64 ackno);
+u16 dccp_ackvec_buflen(const struct dccp_ackvec *av);
-static inline int dccp_ackvec_pending(const struct dccp_ackvec *av)
-{
- return av->dccpav_sent_len != av->dccpav_vec_len;
-}
-#else /* CONFIG_IP_DCCP_ACKVEC */
-static inline struct dccp_ackvec *dccp_ackvec_alloc(unsigned int len,
- const gfp_t priority)
+static inline bool dccp_ackvec_is_empty(const struct dccp_ackvec *av)
{
- return NULL;
+ return av->av_overflow == 0 && av->av_buf_head == av->av_buf_tail;
}
-static inline void dccp_ackvec_free(struct dccp_ackvec *av)
-{
-}
-
-static inline int dccp_ackvec_add(struct dccp_ackvec *av, const struct sock *sk,
- const u64 ackno, const u8 state)
-{
- return -1;
-}
-
-static inline void dccp_ackvec_check_rcv_ackno(struct dccp_ackvec *av,
- struct sock *sk, const u64 ackno)
-{
-}
-
-static inline int dccp_ackvec_parse(struct sock *sk, const struct sk_buff *skb,
- const u8 opt, const u8 *value, const u8 len)
-{
- return -1;
-}
-
-static inline int dccp_insert_option_ackvec(const struct sock *sk,
- const struct sk_buff *skb)
-{
- return -1;
-}
+/**
+ * struct dccp_ackvec_parsed - Record offsets of Ack Vectors in skb
+ * @vec: start of vector (offset into skb)
+ * @len: length of @vec
+ * @nonce: whether @vec had an ECN nonce of 0 or 1
+ * @node: FIFO - arranged in descending order of ack_ackno
+ *
+ * This structure is used by CCIDs to access Ack Vectors in a received skb.
+ */
+struct dccp_ackvec_parsed {
+ u8 *vec,
+ len,
+ nonce:1;
+ struct list_head node;
+};
-static inline int dccp_ackvec_pending(const struct dccp_ackvec *av)
-{
- return 0;
-}
-#endif /* CONFIG_IP_DCCP_ACKVEC */
+int dccp_ackvec_parsed_add(struct list_head *head, u8 *vec, u8 len, u8 nonce);
+void dccp_ackvec_parsed_cleanup(struct list_head *parsed_chunks);
#endif /* _ACKVEC_H */
diff --git a/net/dccp/ccid.c b/net/dccp/ccid.c
index 9d8fc0e289e..597557254dd 100644
--- a/net/dccp/ccid.c
+++ b/net/dccp/ccid.c
@@ -11,129 +11,213 @@
* published by the Free Software Foundation.
*/
+#include <linux/slab.h>
+
#include "ccid.h"
+#include "ccids/lib/tfrc.h"
-static struct ccid *ccids[CCID_MAX];
-#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT)
-static atomic_t ccids_lockct = ATOMIC_INIT(0);
-static DEFINE_SPINLOCK(ccids_lock);
+static struct ccid_operations *ccids[] = {
+ &ccid2_ops,
+#ifdef CONFIG_IP_DCCP_CCID3
+ &ccid3_ops,
+#endif
+};
-/*
- * The strategy is: modifications ccids vector are short, do not sleep and
- * veeery rare, but read access should be free of any exclusive locks.
- */
-static void ccids_write_lock(void)
+static struct ccid_operations *ccid_by_number(const u8 id)
{
- spin_lock(&ccids_lock);
- while (atomic_read(&ccids_lockct) != 0) {
- spin_unlock(&ccids_lock);
- yield();
- spin_lock(&ccids_lock);
- }
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(ccids); i++)
+ if (ccids[i]->ccid_id == id)
+ return ccids[i];
+ return NULL;
}
-static inline void ccids_write_unlock(void)
+/* check that up to @array_len members in @ccid_array are supported */
+bool ccid_support_check(u8 const *ccid_array, u8 array_len)
{
- spin_unlock(&ccids_lock);
+ while (array_len > 0)
+ if (ccid_by_number(ccid_array[--array_len]) == NULL)
+ return false;
+ return true;
}
-static inline void ccids_read_lock(void)
+/**
+ * ccid_get_builtin_ccids - Populate a list of built-in CCIDs
+ * @ccid_array: pointer to copy into
+ * @array_len: value to return length into
+ *
+ * This function allocates memory - caller must see that it is freed after use.
+ */
+int ccid_get_builtin_ccids(u8 **ccid_array, u8 *array_len)
{
- atomic_inc(&ccids_lockct);
- spin_unlock_wait(&ccids_lock);
+ *ccid_array = kmalloc(ARRAY_SIZE(ccids), gfp_any());
+ if (*ccid_array == NULL)
+ return -ENOBUFS;
+
+ for (*array_len = 0; *array_len < ARRAY_SIZE(ccids); *array_len += 1)
+ (*ccid_array)[*array_len] = ccids[*array_len]->ccid_id;
+ return 0;
}
-static inline void ccids_read_unlock(void)
+int ccid_getsockopt_builtin_ccids(struct sock *sk, int len,
+ char __user *optval, int __user *optlen)
{
- atomic_dec(&ccids_lockct);
-}
+ u8 *ccid_array, array_len;
+ int err = 0;
-#else
-#define ccids_write_lock() do { } while(0)
-#define ccids_write_unlock() do { } while(0)
-#define ccids_read_lock() do { } while(0)
-#define ccids_read_unlock() do { } while(0)
-#endif
+ if (ccid_get_builtin_ccids(&ccid_array, &array_len))
+ return -ENOBUFS;
-int ccid_register(struct ccid *ccid)
+ if (put_user(array_len, optlen))
+ err = -EFAULT;
+ else if (len > 0 && copy_to_user(optval, ccid_array,
+ len > array_len ? array_len : len))
+ err = -EFAULT;
+
+ kfree(ccid_array);
+ return err;
+}
+
+static struct kmem_cache *ccid_kmem_cache_create(int obj_size, char *slab_name_fmt, const char *fmt,...)
{
- int err;
+ struct kmem_cache *slab;
+ va_list args;
- if (ccid->ccid_init == NULL)
- return -1;
+ va_start(args, fmt);
+ vsnprintf(slab_name_fmt, CCID_SLAB_NAME_LENGTH, fmt, args);
+ va_end(args);
- ccids_write_lock();
- err = -EEXIST;
- if (ccids[ccid->ccid_id] == NULL) {
- ccids[ccid->ccid_id] = ccid;
- err = 0;
- }
- ccids_write_unlock();
- if (err == 0)
- pr_info("CCID: Registered CCID %d (%s)\n",
- ccid->ccid_id, ccid->ccid_name);
- return err;
+ slab = kmem_cache_create(slab_name_fmt, sizeof(struct ccid) + obj_size, 0,
+ SLAB_HWCACHE_ALIGN, NULL);
+ return slab;
}
-EXPORT_SYMBOL_GPL(ccid_register);
+static void ccid_kmem_cache_destroy(struct kmem_cache *slab)
+{
+ if (slab != NULL)
+ kmem_cache_destroy(slab);
+}
-int ccid_unregister(struct ccid *ccid)
+static int ccid_activate(struct ccid_operations *ccid_ops)
{
- ccids_write_lock();
- ccids[ccid->ccid_id] = NULL;
- ccids_write_unlock();
- pr_info("CCID: Unregistered CCID %d (%s)\n",
- ccid->ccid_id, ccid->ccid_name);
- return 0;
+ int err = -ENOBUFS;
+
+ ccid_ops->ccid_hc_rx_slab =
+ ccid_kmem_cache_create(ccid_ops->ccid_hc_rx_obj_size,
+ ccid_ops->ccid_hc_rx_slab_name,
+ "ccid%u_hc_rx_sock",
+ ccid_ops->ccid_id);
+ if (ccid_ops->ccid_hc_rx_slab == NULL)
+ goto out;
+
+ ccid_ops->ccid_hc_tx_slab =
+ ccid_kmem_cache_create(ccid_ops->ccid_hc_tx_obj_size,
+ ccid_ops->ccid_hc_tx_slab_name,
+ "ccid%u_hc_tx_sock",
+ ccid_ops->ccid_id);
+ if (ccid_ops->ccid_hc_tx_slab == NULL)
+ goto out_free_rx_slab;
+
+ pr_info("DCCP: Activated CCID %d (%s)\n",
+ ccid_ops->ccid_id, ccid_ops->ccid_name);
+ err = 0;
+out:
+ return err;
+out_free_rx_slab:
+ ccid_kmem_cache_destroy(ccid_ops->ccid_hc_rx_slab);
+ ccid_ops->ccid_hc_rx_slab = NULL;
+ goto out;
}
-EXPORT_SYMBOL_GPL(ccid_unregister);
+static void ccid_deactivate(struct ccid_operations *ccid_ops)
+{
+ ccid_kmem_cache_destroy(ccid_ops->ccid_hc_tx_slab);
+ ccid_ops->ccid_hc_tx_slab = NULL;
+ ccid_kmem_cache_destroy(ccid_ops->ccid_hc_rx_slab);
+ ccid_ops->ccid_hc_rx_slab = NULL;
+
+ pr_info("DCCP: Deactivated CCID %d (%s)\n",
+ ccid_ops->ccid_id, ccid_ops->ccid_name);
+}
-struct ccid *ccid_init(unsigned char id, struct sock *sk)
+struct ccid *ccid_new(const u8 id, struct sock *sk, bool rx)
{
- struct ccid *ccid;
+ struct ccid_operations *ccid_ops = ccid_by_number(id);
+ struct ccid *ccid = NULL;
-#ifdef CONFIG_KMOD
- if (ccids[id] == NULL)
- request_module("net-dccp-ccid-%d", id);
-#endif
- ccids_read_lock();
+ if (ccid_ops == NULL)
+ goto out;
- ccid = ccids[id];
+ ccid = kmem_cache_alloc(rx ? ccid_ops->ccid_hc_rx_slab :
+ ccid_ops->ccid_hc_tx_slab, gfp_any());
if (ccid == NULL)
goto out;
-
- if (!try_module_get(ccid->ccid_owner))
- goto out_err;
-
- if (ccid->ccid_init(sk) != 0)
- goto out_module_put;
+ ccid->ccid_ops = ccid_ops;
+ if (rx) {
+ memset(ccid + 1, 0, ccid_ops->ccid_hc_rx_obj_size);
+ if (ccid->ccid_ops->ccid_hc_rx_init != NULL &&
+ ccid->ccid_ops->ccid_hc_rx_init(ccid, sk) != 0)
+ goto out_free_ccid;
+ } else {
+ memset(ccid + 1, 0, ccid_ops->ccid_hc_tx_obj_size);
+ if (ccid->ccid_ops->ccid_hc_tx_init != NULL &&
+ ccid->ccid_ops->ccid_hc_tx_init(ccid, sk) != 0)
+ goto out_free_ccid;
+ }
out:
- ccids_read_unlock();
return ccid;
-out_module_put:
- module_put(ccid->ccid_owner);
-out_err:
+out_free_ccid:
+ kmem_cache_free(rx ? ccid_ops->ccid_hc_rx_slab :
+ ccid_ops->ccid_hc_tx_slab, ccid);
ccid = NULL;
goto out;
}
-EXPORT_SYMBOL_GPL(ccid_init);
+void ccid_hc_rx_delete(struct ccid *ccid, struct sock *sk)
+{
+ if (ccid != NULL) {
+ if (ccid->ccid_ops->ccid_hc_rx_exit != NULL)
+ ccid->ccid_ops->ccid_hc_rx_exit(sk);
+ kmem_cache_free(ccid->ccid_ops->ccid_hc_rx_slab, ccid);
+ }
+}
-void ccid_exit(struct ccid *ccid, struct sock *sk)
+void ccid_hc_tx_delete(struct ccid *ccid, struct sock *sk)
{
- if (ccid == NULL)
- return;
+ if (ccid != NULL) {
+ if (ccid->ccid_ops->ccid_hc_tx_exit != NULL)
+ ccid->ccid_ops->ccid_hc_tx_exit(sk);
+ kmem_cache_free(ccid->ccid_ops->ccid_hc_tx_slab, ccid);
+ }
+}
+
+int __init ccid_initialize_builtins(void)
+{
+ int i, err = tfrc_lib_init();
- ccids_read_lock();
+ if (err)
+ return err;
- if (ccids[ccid->ccid_id] != NULL) {
- if (ccid->ccid_exit != NULL)
- ccid->ccid_exit(sk);
- module_put(ccid->ccid_owner);
+ for (i = 0; i < ARRAY_SIZE(ccids); i++) {
+ err = ccid_activate(ccids[i]);
+ if (err)
+ goto unwind_registrations;
}
+ return 0;
- ccids_read_unlock();
+unwind_registrations:
+ while(--i >= 0)
+ ccid_deactivate(ccids[i]);
+ tfrc_lib_exit();
+ return err;
}
-EXPORT_SYMBOL_GPL(ccid_exit);
+void ccid_cleanup_builtins(void)
+{
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(ccids); i++)
+ ccid_deactivate(ccids[i]);
+ tfrc_lib_exit();
+}
diff --git a/net/dccp/ccid.h b/net/dccp/ccid.h
index de681c6ad08..6eb837a47b5 100644
--- a/net/dccp/ccid.h
+++ b/net/dccp/ccid.h
@@ -19,40 +19,61 @@
#include <linux/list.h>
#include <linux/module.h>
-#define CCID_MAX 255
+/* maximum value for a CCID (RFC 4340, 19.5) */
+#define CCID_MAX 255
+#define CCID_SLAB_NAME_LENGTH 32
struct tcp_info;
-struct ccid {
- unsigned char ccid_id;
- const char *ccid_name;
- struct module *ccid_owner;
- int (*ccid_init)(struct sock *sk);
- void (*ccid_exit)(struct sock *sk);
- int (*ccid_hc_rx_init)(struct sock *sk);
- int (*ccid_hc_tx_init)(struct sock *sk);
+/**
+ * struct ccid_operations - Interface to Congestion-Control Infrastructure
+ *
+ * @ccid_id: numerical CCID ID (up to %CCID_MAX, cf. table 5 in RFC 4340, 10.)
+ * @ccid_ccmps: the CCMPS including network/transport headers (0 when disabled)
+ * @ccid_name: alphabetical identifier string for @ccid_id
+ * @ccid_hc_{r,t}x_slab: memory pool for the receiver/sender half-connection
+ * @ccid_hc_{r,t}x_obj_size: size of the receiver/sender half-connection socket
+ *
+ * @ccid_hc_{r,t}x_init: CCID-specific initialisation routine (before startup)
+ * @ccid_hc_{r,t}x_exit: CCID-specific cleanup routine (before destruction)
+ * @ccid_hc_rx_packet_recv: implements the HC-receiver side
+ * @ccid_hc_{r,t}x_parse_options: parsing routine for CCID/HC-specific options
+ * @ccid_hc_{r,t}x_insert_options: insert routine for CCID/HC-specific options
+ * @ccid_hc_tx_packet_recv: implements feedback processing for the HC-sender
+ * @ccid_hc_tx_send_packet: implements the sending part of the HC-sender
+ * @ccid_hc_tx_packet_sent: does accounting for packets in flight by HC-sender
+ * @ccid_hc_{r,t}x_get_info: INET_DIAG information for HC-receiver/sender
+ * @ccid_hc_{r,t}x_getsockopt: socket options specific to HC-receiver/sender
+ */
+struct ccid_operations {
+ unsigned char ccid_id;
+ __u32 ccid_ccmps;
+ const char *ccid_name;
+ struct kmem_cache *ccid_hc_rx_slab,
+ *ccid_hc_tx_slab;
+ char ccid_hc_rx_slab_name[CCID_SLAB_NAME_LENGTH];
+ char ccid_hc_tx_slab_name[CCID_SLAB_NAME_LENGTH];
+ __u32 ccid_hc_rx_obj_size,
+ ccid_hc_tx_obj_size;
+ /* Interface Routines */
+ int (*ccid_hc_rx_init)(struct ccid *ccid, struct sock *sk);
+ int (*ccid_hc_tx_init)(struct ccid *ccid, struct sock *sk);
void (*ccid_hc_rx_exit)(struct sock *sk);
void (*ccid_hc_tx_exit)(struct sock *sk);
void (*ccid_hc_rx_packet_recv)(struct sock *sk,
struct sk_buff *skb);
- int (*ccid_hc_rx_parse_options)(struct sock *sk,
- unsigned char option,
- unsigned char len, u16 idx,
- unsigned char* value);
- void (*ccid_hc_rx_insert_options)(struct sock *sk,
- struct sk_buff *skb);
- void (*ccid_hc_tx_insert_options)(struct sock *sk,
+ int (*ccid_hc_rx_parse_options)(struct sock *sk, u8 pkt,
+ u8 opt, u8 *val, u8 len);
+ int (*ccid_hc_rx_insert_options)(struct sock *sk,
struct sk_buff *skb);
void (*ccid_hc_tx_packet_recv)(struct sock *sk,
struct sk_buff *skb);
- int (*ccid_hc_tx_parse_options)(struct sock *sk,
- unsigned char option,
- unsigned char len, u16 idx,
- unsigned char* value);
+ int (*ccid_hc_tx_parse_options)(struct sock *sk, u8 pkt,
+ u8 opt, u8 *val, u8 len);
int (*ccid_hc_tx_send_packet)(struct sock *sk,
- struct sk_buff *skb, int len);
- void (*ccid_hc_tx_packet_sent)(struct sock *sk, int more,
- int len);
+ struct sk_buff *skb);
+ void (*ccid_hc_tx_packet_sent)(struct sock *sk,
+ unsigned int len);
void (*ccid_hc_rx_get_info)(struct sock *sk,
struct tcp_info *info);
void (*ccid_hc_tx_get_info)(struct sock *sk,
@@ -67,126 +88,157 @@ struct ccid {
int __user *optlen);
};
-extern int ccid_register(struct ccid *ccid);
-extern int ccid_unregister(struct ccid *ccid);
+extern struct ccid_operations ccid2_ops;
+#ifdef CONFIG_IP_DCCP_CCID3
+extern struct ccid_operations ccid3_ops;
+#endif
-extern struct ccid *ccid_init(unsigned char id, struct sock *sk);
-extern void ccid_exit(struct ccid *ccid, struct sock *sk);
+int ccid_initialize_builtins(void);
+void ccid_cleanup_builtins(void);
-static inline void __ccid_get(struct ccid *ccid)
-{
- __module_get(ccid->ccid_owner);
-}
+struct ccid {
+ struct ccid_operations *ccid_ops;
+ char ccid_priv[0];
+};
-static inline int ccid_hc_tx_send_packet(struct ccid *ccid, struct sock *sk,
- struct sk_buff *skb, int len)
+static inline void *ccid_priv(const struct ccid *ccid)
{
- int rc = 0;
- if (ccid->ccid_hc_tx_send_packet != NULL)
- rc = ccid->ccid_hc_tx_send_packet(sk, skb, len);
- return rc;
+ return (void *)ccid->ccid_priv;
}
-static inline void ccid_hc_tx_packet_sent(struct ccid *ccid, struct sock *sk,
- int more, int len)
+bool ccid_support_check(u8 const *ccid_array, u8 array_len);
+int ccid_get_builtin_ccids(u8 **ccid_array, u8 *array_len);
+int ccid_getsockopt_builtin_ccids(struct sock *sk, int len,
+ char __user *, int __user *);
+
+struct ccid *ccid_new(const u8 id, struct sock *sk, bool rx);
+
+static inline int ccid_get_current_rx_ccid(struct dccp_sock *dp)
{
- if (ccid->ccid_hc_tx_packet_sent != NULL)
- ccid->ccid_hc_tx_packet_sent(sk, more, len);
+ struct ccid *ccid = dp->dccps_hc_rx_ccid;
+
+ if (ccid == NULL || ccid->ccid_ops == NULL)
+ return -1;
+ return ccid->ccid_ops->ccid_id;
}
-static inline int ccid_hc_rx_init(struct ccid *ccid, struct sock *sk)
+static inline int ccid_get_current_tx_ccid(struct dccp_sock *dp)
{
- int rc = 0;
- if (ccid->ccid_hc_rx_init != NULL)
- rc = ccid->ccid_hc_rx_init(sk);
- return rc;
+ struct ccid *ccid = dp->dccps_hc_tx_ccid;
+
+ if (ccid == NULL || ccid->ccid_ops == NULL)
+ return -1;
+ return ccid->ccid_ops->ccid_id;
}
-static inline int ccid_hc_tx_init(struct ccid *ccid, struct sock *sk)
+void ccid_hc_rx_delete(struct ccid *ccid, struct sock *sk);
+void ccid_hc_tx_delete(struct ccid *ccid, struct sock *sk);
+
+/*
+ * Congestion control of queued data packets via CCID decision.
+ *
+ * The TX CCID performs its congestion-control by indicating whether and when a
+ * queued packet may be sent, using the return code of ccid_hc_tx_send_packet().
+ * The following modes are supported via the symbolic constants below:
+ * - timer-based pacing (CCID returns a delay value in milliseconds);
+ * - autonomous dequeueing (CCID internally schedules dccps_xmitlet).
+ */
+
+enum ccid_dequeueing_decision {
+ CCID_PACKET_SEND_AT_ONCE = 0x00000, /* "green light": no delay */
+ CCID_PACKET_DELAY_MAX = 0x0FFFF, /* maximum delay in msecs */
+ CCID_PACKET_DELAY = 0x10000, /* CCID msec-delay mode */
+ CCID_PACKET_WILL_DEQUEUE_LATER = 0x20000, /* CCID autonomous mode */
+ CCID_PACKET_ERR = 0xF0000, /* error condition */
+};
+
+static inline int ccid_packet_dequeue_eval(const int return_code)
{
- int rc = 0;
- if (ccid->ccid_hc_tx_init != NULL)
- rc = ccid->ccid_hc_tx_init(sk);
- return rc;
+ if (return_code < 0)
+ return CCID_PACKET_ERR;
+ if (return_code == 0)
+ return CCID_PACKET_SEND_AT_ONCE;
+ if (return_code <= CCID_PACKET_DELAY_MAX)
+ return CCID_PACKET_DELAY;
+ return return_code;
}
-static inline void ccid_hc_rx_exit(struct ccid *ccid, struct sock *sk)
+static inline int ccid_hc_tx_send_packet(struct ccid *ccid, struct sock *sk,
+ struct sk_buff *skb)
{
- if (ccid != NULL && ccid->ccid_hc_rx_exit != NULL &&
- dccp_sk(sk)->dccps_hc_rx_ccid_private != NULL)
- ccid->ccid_hc_rx_exit(sk);
+ if (ccid->ccid_ops->ccid_hc_tx_send_packet != NULL)
+ return ccid->ccid_ops->ccid_hc_tx_send_packet(sk, skb);
+ return CCID_PACKET_SEND_AT_ONCE;
}
-static inline void ccid_hc_tx_exit(struct ccid *ccid, struct sock *sk)
+static inline void ccid_hc_tx_packet_sent(struct ccid *ccid, struct sock *sk,
+ unsigned int len)
{
- if (ccid != NULL && ccid->ccid_hc_tx_exit != NULL &&
- dccp_sk(sk)->dccps_hc_tx_ccid_private != NULL)
- ccid->ccid_hc_tx_exit(sk);
+ if (ccid->ccid_ops->ccid_hc_tx_packet_sent != NULL)
+ ccid->ccid_ops->ccid_hc_tx_packet_sent(sk, len);
}
static inline void ccid_hc_rx_packet_recv(struct ccid *ccid, struct sock *sk,
struct sk_buff *skb)
{
- if (ccid->ccid_hc_rx_packet_recv != NULL)
- ccid->ccid_hc_rx_packet_recv(sk, skb);
+ if (ccid->ccid_ops->ccid_hc_rx_packet_recv != NULL)
+ ccid->ccid_ops->ccid_hc_rx_packet_recv(sk, skb);
}
static inline void ccid_hc_tx_packet_recv(struct ccid *ccid, struct sock *sk,
struct sk_buff *skb)
{
- if (ccid->ccid_hc_tx_packet_recv != NULL)
- ccid->ccid_hc_tx_packet_recv(sk, skb);
+ if (ccid->ccid_ops->ccid_hc_tx_packet_recv != NULL)
+ ccid->ccid_ops->ccid_hc_tx_packet_recv(sk, skb);
}
+/**
+ * ccid_hc_tx_parse_options - Parse CCID-specific options sent by the receiver
+ * @pkt: type of packet that @opt appears on (RFC 4340, 5.1)
+ * @opt: the CCID-specific option type (RFC 4340, 5.8 and 10.3)
+ * @val: value of @opt
+ * @len: length of @val in bytes
+ */
static inline int ccid_hc_tx_parse_options(struct ccid *ccid, struct sock *sk,
- unsigned char option,
- unsigned char len, u16 idx,
- unsigned char* value)
+ u8 pkt, u8 opt, u8 *val, u8 len)
{
- int rc = 0;
- if (ccid->ccid_hc_tx_parse_options != NULL)
- rc = ccid->ccid_hc_tx_parse_options(sk, option, len, idx,
- value);
- return rc;
+ if (ccid->ccid_ops->ccid_hc_tx_parse_options == NULL)
+ return 0;
+ return ccid->ccid_ops->ccid_hc_tx_parse_options(sk, pkt, opt, val, len);
}
+/**
+ * ccid_hc_rx_parse_options - Parse CCID-specific options sent by the sender
+ * Arguments are analogous to ccid_hc_tx_parse_options()
+ */
static inline int ccid_hc_rx_parse_options(struct ccid *ccid, struct sock *sk,
- unsigned char option,
- unsigned char len, u16 idx,
- unsigned char* value)
-{
- int rc = 0;
- if (ccid->ccid_hc_rx_parse_options != NULL)
- rc = ccid->ccid_hc_rx_parse_options(sk, option, len, idx, value);
- return rc;
-}
-
-static inline void ccid_hc_tx_insert_options(struct ccid *ccid, struct sock *sk,
- struct sk_buff *skb)
+ u8 pkt, u8 opt, u8 *val, u8 len)
{
- if (ccid->ccid_hc_tx_insert_options != NULL)
- ccid->ccid_hc_tx_insert_options(sk, skb);
+ if (ccid->ccid_ops->ccid_hc_rx_parse_options == NULL)
+ return 0;
+ return ccid->ccid_ops->ccid_hc_rx_parse_options(sk, pkt, opt, val, len);
}
-static inline void ccid_hc_rx_insert_options(struct ccid *ccid, struct sock *sk,
- struct sk_buff *skb)
+static inline int ccid_hc_rx_insert_options(struct ccid *ccid, struct sock *sk,
+ struct sk_buff *skb)
{
- if (ccid->ccid_hc_rx_insert_options != NULL)
- ccid->ccid_hc_rx_insert_options(sk, skb);
+ if (ccid->ccid_ops->ccid_hc_rx_insert_options != NULL)
+ return ccid->ccid_ops->ccid_hc_rx_insert_options(sk, skb);
+ return 0;
}
static inline void ccid_hc_rx_get_info(struct ccid *ccid, struct sock *sk,
struct tcp_info *info)
{
- if (ccid->ccid_hc_rx_get_info != NULL)
- ccid->ccid_hc_rx_get_info(sk, info);
+ if (ccid->ccid_ops->ccid_hc_rx_get_info != NULL)
+ ccid->ccid_ops->ccid_hc_rx_get_info(sk, info);
}
static inline void ccid_hc_tx_get_info(struct ccid *ccid, struct sock *sk,
struct tcp_info *info)
{
- if (ccid->ccid_hc_tx_get_info != NULL)
- ccid->ccid_hc_tx_get_info(sk, info);
+ if (ccid->ccid_ops->ccid_hc_tx_get_info != NULL)
+ ccid->ccid_ops->ccid_hc_tx_get_info(sk, info);
}
static inline int ccid_hc_rx_getsockopt(struct ccid *ccid, struct sock *sk,
@@ -194,8 +246,8 @@ static inline int ccid_hc_rx_getsockopt(struct ccid *ccid, struct sock *sk,
u32 __user *optval, int __user *optlen)
{
int rc = -ENOPROTOOPT;
- if (ccid->ccid_hc_rx_getsockopt != NULL)
- rc = ccid->ccid_hc_rx_getsockopt(sk, optname, len,
+ if (ccid != NULL && ccid->ccid_ops->ccid_hc_rx_getsockopt != NULL)
+ rc = ccid->ccid_ops->ccid_hc_rx_getsockopt(sk, optname, len,
optval, optlen);
return rc;
}
@@ -205,8 +257,8 @@ static inline int ccid_hc_tx_getsockopt(struct ccid *ccid, struct sock *sk,
u32 __user *optval, int __user *optlen)
{
int rc = -ENOPROTOOPT;
- if (ccid->ccid_hc_tx_getsockopt != NULL)
- rc = ccid->ccid_hc_tx_getsockopt(sk, optname, len,
+ if (ccid != NULL && ccid->ccid_ops->ccid_hc_tx_getsockopt != NULL)
+ rc = ccid->ccid_ops->ccid_hc_tx_getsockopt(sk, optname, len,
optval, optlen);
return rc;
}
diff --git a/net/dccp/ccids/Kconfig b/net/dccp/ccids/Kconfig
index 7684d83946a..8ba3fc9d6d1 100644
--- a/net/dccp/ccids/Kconfig
+++ b/net/dccp/ccids/Kconfig
@@ -1,29 +1,54 @@
-menu "DCCP CCIDs Configuration (EXPERIMENTAL)"
- depends on IP_DCCP && EXPERIMENTAL
+menu "DCCP CCIDs Configuration"
+
+config IP_DCCP_CCID2_DEBUG
+ bool "CCID-2 debugging messages"
+ ---help---
+ Enable CCID-2 specific debugging messages.
+
+ The debugging output can additionally be toggled by setting the
+ ccid2_debug parameter to 0 or 1.
+
+ If in doubt, say N.
config IP_DCCP_CCID3
- tristate "CCID3 (TFRC) (EXPERIMENTAL)"
- depends on IP_DCCP
+ bool "CCID-3 (TCP-Friendly)"
+ def_bool y if (IP_DCCP = y || IP_DCCP = m)
---help---
- CCID 3 denotes TCP-Friendly Rate Control (TFRC), an equation-based
+ CCID-3 denotes TCP-Friendly Rate Control (TFRC), an equation-based
rate-controlled congestion control mechanism. TFRC is designed to
be reasonably fair when competing for bandwidth with TCP-like flows,
where a flow is "reasonably fair" if its sending rate is generally
within a factor of two of the sending rate of a TCP flow under the
same conditions. However, TFRC has a much lower variation of
- throughput over time compared with TCP, which makes CCID 3 more
- suitable than CCID 2 for applications such streaming media where a
+ throughput over time compared with TCP, which makes CCID-3 more
+ suitable than CCID-2 for applications such as streaming media where a
relatively smooth sending rate is of importance.
- CCID 3 is further described in [CCID 3 PROFILE]. The TFRC
- congestion control algorithms were initially described in RFC 3448.
+ CCID-3 is further described in RFC 4342,
+ http://www.ietf.org/rfc/rfc4342.txt
- This text was extracted from draft-ietf-dccp-spec-11.txt.
-
- If in doubt, say M.
+ The TFRC congestion control algorithms were initially described in
+ RFC 3448, which has since been obsoleted by RFC 5348.
-config IP_DCCP_TFRC_LIB
+ This text was extracted from RFC 4340 (sec. 10.2),
+ http://www.ietf.org/rfc/rfc4340.txt
+
+ If in doubt, say N.
+
+config IP_DCCP_CCID3_DEBUG
+ bool "CCID-3 debugging messages"
depends on IP_DCCP_CCID3
- def_tristate IP_DCCP_CCID3
+ ---help---
+ Enable CCID-3 specific debugging messages.
+
+ The debugging output can additionally be toggled by setting the
+ ccid3_debug parameter to 0 or 1.
+
+ If in doubt, say N.
+
+config IP_DCCP_TFRC_LIB
+ def_bool y if IP_DCCP_CCID3
+config IP_DCCP_TFRC_DEBUG
+ def_bool y if IP_DCCP_CCID3_DEBUG
endmenu
diff --git a/net/dccp/ccids/Makefile b/net/dccp/ccids/Makefile
deleted file mode 100644
index 956f79f5074..00000000000
--- a/net/dccp/ccids/Makefile
+++ /dev/null
@@ -1,5 +0,0 @@
-obj-$(CONFIG_IP_DCCP_CCID3) += dccp_ccid3.o
-
-dccp_ccid3-y := ccid3.o
-
-obj-y += lib/
diff --git a/net/dccp/ccids/ccid2.c b/net/dccp/ccids/ccid2.c
new file mode 100644
index 00000000000..f053198e730
--- /dev/null
+++ b/net/dccp/ccids/ccid2.c
@@ -0,0 +1,784 @@
+/*
+ * Copyright (c) 2005, 2006 Andrea Bittau <a.bittau@cs.ucl.ac.uk>
+ *
+ * Changes to meet Linux coding standards, and DCCP infrastructure fixes.
+ *
+ * Copyright (c) 2006 Arnaldo Carvalho de Melo <acme@conectiva.com.br>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+/*
+ * This implementation should follow RFC 4341
+ */
+#include <linux/slab.h>
+#include "../feat.h"
+#include "ccid2.h"
+
+
+#ifdef CONFIG_IP_DCCP_CCID2_DEBUG
+static bool ccid2_debug;
+#define ccid2_pr_debug(format, a...) DCCP_PR_DEBUG(ccid2_debug, format, ##a)
+#else
+#define ccid2_pr_debug(format, a...)
+#endif
+
+/*
+ * ccid2_hc_tx_alloc_seq - add one buffer of entries to the packet history
+ * @hc: TX half-connection whose history ring is to be grown
+ *
+ * Allocates CCID2_SEQBUF_LEN entries, links them into a circular doubly-linked
+ * list and, unless this is the very first buffer, splices that list in between
+ * the current head (tx_seqh) and the current tail (tx_seqt).
+ *
+ * Returns 0 on success, -ENOMEM if tx_seqbuf[] has no free slot left or if
+ * kmalloc() fails.
+ */
+static int ccid2_hc_tx_alloc_seq(struct ccid2_hc_tx_sock *hc)
+{
+ struct ccid2_seq *seqp;
+ int i;
+
+ /* check if we have space to preserve the pointer to the buffer */
+ if (hc->tx_seqbufc >= (sizeof(hc->tx_seqbuf) /
+ sizeof(struct ccid2_seq *)))
+ return -ENOMEM;
+
+ /* allocate buffer and initialize linked list */
+ seqp = kmalloc(CCID2_SEQBUF_LEN * sizeof(struct ccid2_seq), gfp_any());
+ if (seqp == NULL)
+ return -ENOMEM;
+
+ /* chain neighbouring entries in both directions ... */
+ for (i = 0; i < (CCID2_SEQBUF_LEN - 1); i++) {
+ seqp[i].ccid2s_next = &seqp[i + 1];
+ seqp[i + 1].ccid2s_prev = &seqp[i];
+ }
+ /* ... and close the ring: last entry <-> first entry */
+ seqp[CCID2_SEQBUF_LEN - 1].ccid2s_next = seqp;
+ seqp->ccid2s_prev = &seqp[CCID2_SEQBUF_LEN - 1];
+
+ /* This is the first allocation. Initialise the head and tail. */
+ if (hc->tx_seqbufc == 0)
+ hc->tx_seqh = hc->tx_seqt = seqp;
+ else {
+ /* link the existing list with the one we just created */
+ hc->tx_seqh->ccid2s_next = seqp;
+ seqp->ccid2s_prev = hc->tx_seqh;
+
+ hc->tx_seqt->ccid2s_prev = &seqp[CCID2_SEQBUF_LEN - 1];
+ seqp[CCID2_SEQBUF_LEN - 1].ccid2s_next = hc->tx_seqt;
+ }
+
+ /* store the original pointer to the buffer so we can free it */
+ hc->tx_seqbuf[hc->tx_seqbufc] = seqp;
+ hc->tx_seqbufc++;
+
+ return 0;
+}
+
+/*
+ * Decide whether @skb may leave now: reports CCID_PACKET_WILL_DEQUEUE_LATER
+ * while pipe has reached cwnd, CCID_PACKET_SEND_AT_ONCE otherwise.
+ */
+static int ccid2_hc_tx_send_packet(struct sock *sk, struct sk_buff *skb)
+{
+	struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk);
+
+	return ccid2_cwnd_network_limited(hc) ? CCID_PACKET_WILL_DEQUEUE_LATER :
+						CCID_PACKET_SEND_AT_ONCE;
+}
+
+/*
+ * Ask feature negotiation for a new local Ack Ratio.
+ *
+ * The requested @val is first limited to ceil(cwnd/2), which is (2) from
+ * RFC 4341, 6.1.2; a value of 0 (Ack Ratio disabled) is replaced by that same
+ * limit. The statement that Ack Ratio 2 is always acceptable is deliberately
+ * ignored, since it causes starvation/deadlock whenever cwnd < 2. Finally the
+ * result is capped at DCCPF_ACK_RATIO_MAX.
+ */
+static void ccid2_change_l_ack_ratio(struct sock *sk, u32 val)
+{
+	const u32 limit = DIV_ROUND_UP(ccid2_hc_tx_sk(sk)->tx_cwnd, 2);
+	const bool out_of_range = !val || val > limit;
+
+	if (out_of_range) {
+		DCCP_WARN("Limiting Ack Ratio (%u) to %u\n", val, limit);
+		val = limit;
+	}
+
+	val = min_t(u32, val, DCCPF_ACK_RATIO_MAX);
+	dccp_feat_signal_nn_change(sk, DCCPF_ACK_RATIO, val);
+}
+
+/*
+ * Re-validate the Ack Ratio against the congestion window.
+ *
+ * To be called after a loss, an idle or application-limited period, or an
+ * RTO. If Ack Ratio exceeds cwnd, a full window of packets could be sent
+ * without ever triggering an Ack, because fewer than Ack Ratio packets would
+ * have been sent. In that case the ratio is lowered to half of cwnd (or to 1
+ * if that would be zero) rather than to cwnd itself, which leaves some slack
+ * should a single Ack get lost.
+ */
+static void ccid2_check_l_ack_ratio(struct sock *sk)
+{
+	struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk);
+	u32 half_cwnd;
+
+	if (dccp_feat_nn_get(sk, DCCPF_ACK_RATIO) <= hc->tx_cwnd)
+		return;
+
+	half_cwnd = hc->tx_cwnd / 2;
+	ccid2_change_l_ack_ratio(sk, half_cwnd ? half_cwnd : 1U);
+}
+
+/*
+ * Request a new local Sequence Window of @val, kept within
+ * [DCCPF_SEQ_WMIN, DCCPF_SEQ_WMAX].
+ */
+static void ccid2_change_l_seq_window(struct sock *sk, u64 val)
+{
+	const u64 bounded = clamp_val(val, DCCPF_SEQ_WMIN, DCCPF_SEQ_WMAX);
+
+	dccp_feat_signal_nn_change(sk, DCCPF_SEQUENCE_WINDOW, bounded);
+}
+
+/*
+ * RTO timer callback; @data is the socket pointer that was handed to
+ * setup_timer() in ccid2_hc_tx_init().
+ *
+ * While the socket is owned by user context the work is postponed by re-arming
+ * the timer for HZ / 5 jiffies. Otherwise the RTO is doubled (capped at
+ * DCCP_RTO_MAX), ssthresh is set to max(cwnd / 2, 2), cwnd to 1 and pipe to 0,
+ * the packet history is emptied, Ack Ratio 1 is requested and the timer is
+ * restarted with the backed-off RTO.
+ */
+static void ccid2_hc_tx_rto_expire(unsigned long data)
+{
+ struct sock *sk = (struct sock *)data;
+ struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk);
+ /* NOTE(review): sampled before bh_lock_sock() is taken - confirm this is safe */
+ const bool sender_was_blocked = ccid2_cwnd_network_limited(hc);
+
+ bh_lock_sock(sk);
+ if (sock_owned_by_user(sk)) {
+ /* socket busy: try again in HZ / 5 jiffies */
+ sk_reset_timer(sk, &hc->tx_rtotimer, jiffies + HZ / 5);
+ goto out;
+ }
+
+ ccid2_pr_debug("RTO_EXPIRE\n");
+
+ /* back-off timer */
+ hc->tx_rto <<= 1;
+ if (hc->tx_rto > DCCP_RTO_MAX)
+ hc->tx_rto = DCCP_RTO_MAX;
+
+ /* adjust pipe, cwnd etc */
+ hc->tx_ssthresh = hc->tx_cwnd / 2;
+ if (hc->tx_ssthresh < 2)
+ hc->tx_ssthresh = 2;
+ hc->tx_cwnd = 1;
+ hc->tx_pipe = 0;
+
+ /* clear state about stuff we sent (tail == head means empty history) */
+ hc->tx_seqt = hc->tx_seqh;
+ hc->tx_packets_acked = 0;
+
+ /* clear ack ratio state. */
+ hc->tx_rpseq = 0;
+ hc->tx_rpdupack = -1;
+ ccid2_change_l_ack_ratio(sk, 1);
+
+ /* if we were blocked before, we may now send cwnd=1 packet */
+ if (sender_was_blocked)
+ tasklet_schedule(&dccp_sk(sk)->dccps_xmitlet);
+ /* restart backed-off timer */
+ sk_reset_timer(sk, &hc->tx_rtotimer, jiffies + hc->tx_rto);
+out:
+ bh_unlock_sock(sk);
+ /* presumably balances the reference held for the pending timer - verify */
+ sock_put(sk);
+}
+
+/*
+ * Congestion window validation (RFC 2861).
+ */
+static bool ccid2_do_cwv = true;
+module_param(ccid2_do_cwv, bool, 0644);
+MODULE_PARM_DESC(ccid2_do_cwv, "Perform RFC2861 Congestion Window Validation");
+
+/**
+ * ccid2_update_used_window - Feed a new sample into the expected-window filter
+ * @hc: TX half-connection state
+ * @new_wnd: latest window sample
+ *
+ * Complements CWV: to choose the local Sequence Window (RFC 4340, 7.5.2) the
+ * sender needs an estimate of how many packets may be in flight. The
+ * maximum-used window tracked for CWV serves as input, and an EWMA low-pass
+ * filter (weight 3/4 on the old value, 1/4 on the sample) removes noise.
+ */
+static void ccid2_update_used_window(struct ccid2_hc_tx_sock *hc, u32 new_wnd)
+{
+	const u32 history = 3 * hc->tx_expected_wnd;
+
+	hc->tx_expected_wnd = (history + new_wnd) / 4;
+}
+
+/*
+ * Shrink cwnd after an application-limited period; modelled on
+ * tcp_cwnd_application_limited().
+ *
+ * If less than cwnd was used (the used window being counted as at least the
+ * initial window), ssthresh is raised to at least 3/4 of cwnd and cwnd moves
+ * halfway towards the used window. The CWV bookkeeping is then restarted at
+ * @now and the Ack Ratio is re-checked against the new cwnd.
+ */
+static void ccid2_cwnd_application_limited(struct sock *sk, const u32 now)
+{
+	struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk);
+	u32 iwnd = rfc3390_bytes_to_packets(dccp_sk(sk)->dccps_mss_cache);
+	/* don't reduce cwnd below the initial window (IW) */
+	u32 used = max(hc->tx_cwnd_used, iwnd);
+	u32 cwnd = hc->tx_cwnd;
+
+	if (used < cwnd) {
+		u32 three_quarters = (cwnd >> 1) + (cwnd >> 2);
+
+		hc->tx_ssthresh = max(hc->tx_ssthresh, three_quarters);
+		hc->tx_cwnd = (cwnd + used) >> 1;
+	}
+
+	hc->tx_cwnd_stamp = now;
+	hc->tx_cwnd_used = 0;
+
+	ccid2_check_l_ack_ratio(sk);
+}
+
+/*
+ * ccid2_cwnd_restart - shrink cwnd after an idle period (RFC 2861)
+ * @sk: socket whose TX half-connection is restarted
+ * @now: current time in ccid2_time_stamp units
+ *
+ * This borrows the code of tcp_cwnd_restart(): cwnd is halved once for every
+ * full RTO that has elapsed since the last packet was sent, but never drops
+ * below min(cwnd, initial window). ssthresh is raised to at least 3/4 of the
+ * old cwnd, the CWV bookkeeping is restarted at @now and the Ack Ratio is
+ * re-checked against the new cwnd.
+ */
+static void ccid2_cwnd_restart(struct sock *sk, const u32 now)
+{
+	struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk);
+	u32 cwnd = hc->tx_cwnd, restart_cwnd,
+	    iwnd = rfc3390_bytes_to_packets(dccp_sk(sk)->dccps_mss_cache);
+	s32 delta = now - hc->tx_lsndtime;
+
+	hc->tx_ssthresh = max(hc->tx_ssthresh, (cwnd >> 1) + (cwnd >> 2));
+
+	/* don't reduce cwnd below the initial window (IW) */
+	restart_cwnd = min(cwnd, iwnd);
+
+	/*
+	 * Halve step by step instead of using "cwnd >>= delta / rto": after a
+	 * long idle time that quotient can reach 32 or more, and shifting a
+	 * u32 by at least its width is undefined behaviour. The loop also
+	 * stops as soon as the floor is reached.
+	 */
+	while ((delta -= hc->tx_rto) >= 0 && cwnd > restart_cwnd)
+		cwnd >>= 1;
+	hc->tx_cwnd = max(cwnd, restart_cwnd);
+
+	hc->tx_cwnd_stamp = now;
+	hc->tx_cwnd_used = 0;
+
+	ccid2_check_l_ack_ratio(sk);
+}
+
+/*
+ * ccid2_hc_tx_packet_sent - account for a packet that has just been sent
+ * @sk: sending socket
+ * @len: not used by this function
+ *
+ * Applies the optional congestion window validation (restart after idle,
+ * application-limited check), increments pipe, records sequence number and
+ * send time in the history entry at tx_seqh (growing the history when the
+ * ring is full) and (re)arms the RTO timer.
+ */
+static void ccid2_hc_tx_packet_sent(struct sock *sk, unsigned int len)
+{
+ struct dccp_sock *dp = dccp_sk(sk);
+ struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk);
+ const u32 now = ccid2_time_stamp;
+ struct ccid2_seq *next;
+
+ /* slow-start after idle periods (RFC 2581, RFC 2861) */
+ if (ccid2_do_cwv && !hc->tx_pipe &&
+ (s32)(now - hc->tx_lsndtime) >= hc->tx_rto)
+ ccid2_cwnd_restart(sk, now);
+
+ hc->tx_lsndtime = now;
+ hc->tx_pipe += 1;
+
+ /* see whether cwnd was fully used (RFC 2861), update expected window */
+ if (ccid2_cwnd_network_limited(hc)) {
+ ccid2_update_used_window(hc, hc->tx_cwnd);
+ hc->tx_cwnd_used = 0;
+ hc->tx_cwnd_stamp = now;
+ } else {
+ /* cwnd not filled: remember the largest pipe seen so far */
+ if (hc->tx_pipe > hc->tx_cwnd_used)
+ hc->tx_cwnd_used = hc->tx_pipe;
+
+ ccid2_update_used_window(hc, hc->tx_cwnd_used);
+
+ if (ccid2_do_cwv && (s32)(now - hc->tx_cwnd_stamp) >= hc->tx_rto)
+ ccid2_cwnd_application_limited(sk, now);
+ }
+
+ /* record this packet in the history entry at the head */
+ hc->tx_seqh->ccid2s_seq = dp->dccps_gss;
+ hc->tx_seqh->ccid2s_acked = 0;
+ hc->tx_seqh->ccid2s_sent = now;
+
+ next = hc->tx_seqh->ccid2s_next;
+ /* check if we need to alloc more space */
+ if (next == hc->tx_seqt) {
+ if (ccid2_hc_tx_alloc_seq(hc)) {
+ DCCP_CRIT("packet history - out of memory!");
+ /*
+ * FIXME: find a more graceful way to bail out.
+ * Returning here leaves tx_seqh where it was and
+ * does not re-arm the RTO timer.
+ */
+ return;
+ }
+ next = hc->tx_seqh->ccid2s_next;
+ BUG_ON(next == hc->tx_seqt);
+ }
+ hc->tx_seqh = next;
+
+ ccid2_pr_debug("cwnd=%d pipe=%d\n", hc->tx_cwnd, hc->tx_pipe);
+
+ /*
+ * FIXME: The code below is broken and the variables have been removed
+ * from the socket struct. The `ackloss' variable was always set to 0,
+ * and with arsent there are several problems:
+ * (i) it doesn't just count the number of Acks, but all sent packets;
+ * (ii) it is expressed in # of packets, not # of windows, so the
+ * comparison below uses the wrong formula: Appendix A of RFC 4341
+ * comes up with the number K = cwnd / (R^2 - R) of consecutive windows
+ * of data with no lost or marked Ack packets. If arsent were the # of
+ * consecutive Acks received without loss, then Ack Ratio needs to be
+ * decreased by 1 when
+ * arsent >= K * cwnd / R = cwnd^2 / (R^3 - R^2)
+ * where cwnd / R is the number of Acks received per window of data
+ * (cf. RFC 4341, App. A). The problems are that
+ * - arsent counts other packets as well;
+ * - the comparison uses a formula different from RFC 4341;
+ * - computing a cubic/quadratic equation each time is too complicated.
+ * Hence a different algorithm is needed.
+ */
+#if 0
+ /* Ack Ratio. Need to maintain a concept of how many windows we sent */
+ hc->tx_arsent++;
+ /* We had an ack loss in this window... */
+ if (hc->tx_ackloss) {
+ if (hc->tx_arsent >= hc->tx_cwnd) {
+ hc->tx_arsent = 0;
+ hc->tx_ackloss = 0;
+ }
+ } else {
+ /* No acks lost up to now... */
+ /* decrease ack ratio if enough packets were sent */
+ if (dp->dccps_l_ack_ratio > 1) {
+ /* XXX don't calculate denominator each time */
+ int denom = dp->dccps_l_ack_ratio * dp->dccps_l_ack_ratio -
+ dp->dccps_l_ack_ratio;
+
+ denom = hc->tx_cwnd * hc->tx_cwnd / denom;
+
+ if (hc->tx_arsent >= denom) {
+ ccid2_change_l_ack_ratio(sk, dp->dccps_l_ack_ratio - 1);
+ hc->tx_arsent = 0;
+ }
+ } else {
+ /* we can't increase ack ratio further [1] */
+ hc->tx_arsent = 0; /* or maybe set it to cwnd*/
+ }
+ }
+#endif
+
+ sk_reset_timer(sk, &hc->tx_rtotimer, jiffies + hc->tx_rto);
+
+#ifdef CONFIG_IP_DCCP_CCID2_DEBUG
+ /* dump every history entry between tail and head */
+ do {
+ struct ccid2_seq *seqp = hc->tx_seqt;
+
+ while (seqp != hc->tx_seqh) {
+ ccid2_pr_debug("out seq=%llu acked=%d time=%u\n",
+ (unsigned long long)seqp->ccid2s_seq,
+ seqp->ccid2s_acked, seqp->ccid2s_sent);
+ seqp = seqp->ccid2s_next;
+ }
+ } while (0);
+ ccid2_pr_debug("=========\n");
+#endif
+}
+
+/**
+ * ccid2_rtt_estimator - Sample RTT and compute RTO using RFC2988 algorithm
+ * @sk: socket whose TX half-connection is updated
+ * @mrtt: new RTT sample, in ccid2_time_stamp units; 0 is treated as 1
+ *
+ * This code is almost identical with TCP's tcp_rtt_estimator(), since
+ * - it has a higher sampling frequency (recommended by RFC 1323),
+ * - the RTO does not collapse into RTT due to RTTVAR going towards zero,
+ * - it is simple (cf. more complex proposals such as Eifel timer or research
+ * which suggests that the gain should be set according to window size),
+ * - in tests it was found to work well with CCID2 [gerrit].
+ *
+ * Updates tx_srtt, tx_mdev, tx_mdev_max, tx_rttvar and tx_rtt_seq, and
+ * finally sets tx_rto, capped at DCCP_RTO_MAX.
+ */
+static void ccid2_rtt_estimator(struct sock *sk, const long mrtt)
+{
+ struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk);
+ long m = mrtt ? : 1;
+
+ if (hc->tx_srtt == 0) {
+ /* First measurement m */
+ hc->tx_srtt = m << 3;
+ hc->tx_mdev = m << 1;
+
+ hc->tx_mdev_max = max(hc->tx_mdev, tcp_rto_min(sk));
+ hc->tx_rttvar = hc->tx_mdev_max;
+
+ hc->tx_rtt_seq = dccp_sk(sk)->dccps_gss;
+ } else {
+ /* Update scaled SRTT as SRTT += 1/8 * (m - SRTT) */
+ m -= (hc->tx_srtt >> 3);
+ hc->tx_srtt += m;
+
+ /* Similarly, update scaled mdev with regard to |m| */
+ if (m < 0) {
+ m = -m;
+ m -= (hc->tx_mdev >> 2);
+ /*
+ * This neutralises RTO increase when RTT < SRTT - mdev
+ * (see P. Sarolahti, A. Kuznetsov,"Congestion Control
+ * in Linux TCP", USENIX 2002, pp. 49-62).
+ */
+ if (m > 0)
+ m >>= 3;
+ } else {
+ m -= (hc->tx_mdev >> 2);
+ }
+ hc->tx_mdev += m;
+
+ /* track the per-flight maximum; RTTVAR follows it upwards at once */
+ if (hc->tx_mdev > hc->tx_mdev_max) {
+ hc->tx_mdev_max = hc->tx_mdev;
+ if (hc->tx_mdev_max > hc->tx_rttvar)
+ hc->tx_rttvar = hc->tx_mdev_max;
+ }
+
+ /*
+ * Decay RTTVAR at most once per flight, exploiting that
+ * 1) pipe <= cwnd <= Sequence_Window = W (RFC 4340, 7.5.2)
+ * 2) AWL = GSS-W+1 <= GAR <= GSS (RFC 4340, 7.5.1)
+ * GAR is a useful bound for FlightSize = pipe.
+ * AWL is probably too low here, as it over-estimates pipe.
+ */
+ if (after48(dccp_sk(sk)->dccps_gar, hc->tx_rtt_seq)) {
+ if (hc->tx_mdev_max < hc->tx_rttvar)
+ hc->tx_rttvar -= (hc->tx_rttvar -
+ hc->tx_mdev_max) >> 2;
+ hc->tx_rtt_seq = dccp_sk(sk)->dccps_gss;
+ hc->tx_mdev_max = tcp_rto_min(sk);
+ }
+ }
+
+ /*
+ * Set RTO from SRTT and RTTVAR
+ * As in TCP, 4 * RTTVAR >= TCP_RTO_MIN, giving a minimum RTO of 200 ms.
+ * This agrees with RFC 4341, 5:
+ * "Because DCCP does not retransmit data, DCCP does not require
+ * TCP's recommended minimum timeout of one second".
+ */
+ hc->tx_rto = (hc->tx_srtt >> 3) + hc->tx_rttvar;
+
+ if (hc->tx_rto > DCCP_RTO_MAX)
+ hc->tx_rto = DCCP_RTO_MAX;
+}
+
+/*
+ * ccid2_new_ack - process a newly acknowledged, non-marked packet
+ * @sk: sending socket
+ * @seqp: history entry of the acknowledged packet
+ * @maxincr: remaining cwnd increase allowed while in slow-start; decremented
+ * whenever cwnd grows in slow-start
+ *
+ * Grows cwnd (by one for every second acked packet in slow-start while
+ * *@maxincr lasts, by one per cwnd acked packets otherwise) as long as the
+ * local and remote sequence windows permit, re-tunes Ack Ratio and local
+ * Sequence Window relative to cwnd, and feeds an RTT sample into the
+ * estimator.
+ */
+static void ccid2_new_ack(struct sock *sk, struct ccid2_seq *seqp,
+ unsigned int *maxincr)
+{
+ struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk);
+ struct dccp_sock *dp = dccp_sk(sk);
+ /* NOTE(review): relies on dccps_l_ack_ratio being non-zero - confirm */
+ int r_seq_used = hc->tx_cwnd / dp->dccps_l_ack_ratio;
+
+ if (hc->tx_cwnd < dp->dccps_l_seq_win &&
+ r_seq_used < dp->dccps_r_seq_win) {
+ if (hc->tx_cwnd < hc->tx_ssthresh) {
+ /* slow-start; the counter only advances while *maxincr > 0 */
+ if (*maxincr > 0 && ++hc->tx_packets_acked >= 2) {
+ hc->tx_cwnd += 1;
+ *maxincr -= 1;
+ hc->tx_packets_acked = 0;
+ }
+ } else if (++hc->tx_packets_acked >= hc->tx_cwnd) {
+ /* congestion avoidance: +1 after cwnd acked packets */
+ hc->tx_cwnd += 1;
+ hc->tx_packets_acked = 0;
+ }
+ }
+
+ /*
+ * Adjust the local sequence window and the ack ratio to allow about
+ * 5 times the number of packets in the network (RFC 4340 7.5.2)
+ */
+ if (r_seq_used * CCID2_WIN_CHANGE_FACTOR >= dp->dccps_r_seq_win)
+ ccid2_change_l_ack_ratio(sk, dp->dccps_l_ack_ratio * 2);
+ else if (r_seq_used * CCID2_WIN_CHANGE_FACTOR < dp->dccps_r_seq_win/2)
+ ccid2_change_l_ack_ratio(sk, dp->dccps_l_ack_ratio / 2 ? : 1U);
+
+ if (hc->tx_cwnd * CCID2_WIN_CHANGE_FACTOR >= dp->dccps_l_seq_win)
+ ccid2_change_l_seq_window(sk, dp->dccps_l_seq_win * 2);
+ else if (hc->tx_cwnd * CCID2_WIN_CHANGE_FACTOR < dp->dccps_l_seq_win/2)
+ ccid2_change_l_seq_window(sk, dp->dccps_l_seq_win / 2);
+
+ /*
+ * FIXME: RTT is sampled several times per acknowledgment (for each
+ * entry in the Ack Vector), instead of once per Ack (as in TCP SACK).
+ * This causes the RTT to be over-estimated, since the older entries
+ * in the Ack Vector have earlier sending times.
+ * The cleanest solution is to not use the ccid2s_sent field at all
+ * and instead use DCCP timestamps: requires changes in other places.
+ */
+ ccid2_rtt_estimator(sk, ccid2_time_stamp - seqp->ccid2s_sent);
+}
+
+/*
+ * React to a lost or ECN-marked packet described by @seqp.
+ *
+ * Packets sent before the previous congestion event was registered are
+ * ignored, so that several losses in one RTT count only once. Otherwise the
+ * event time is recorded, cwnd is halved (but not below 1), ssthresh becomes
+ * max(cwnd, 2) and the Ack Ratio is re-checked against the new cwnd.
+ */
+static void ccid2_congestion_event(struct sock *sk, struct ccid2_seq *seqp)
+{
+	struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk);
+	const s32 since_last = (s32)(seqp->ccid2s_sent - hc->tx_last_cong);
+	u32 halved;
+
+	if (since_last < 0) {
+		ccid2_pr_debug("Multiple losses in an RTT---treating as one\n");
+		return;
+	}
+
+	hc->tx_last_cong = ccid2_time_stamp;
+
+	halved = hc->tx_cwnd / 2;
+	hc->tx_cwnd = halved ? halved : 1U;
+	hc->tx_ssthresh = max(hc->tx_cwnd, 2U);
+
+	ccid2_check_l_ack_ratio(sk);
+}
+
+/*
+ * Option hook of the sender: Ack Vector options (both nonce variants) are
+ * handed to dccp_ackvec_parsed_add() to be queued on tx_av_chunks, with the
+ * offset from DCCPO_ACK_VECTOR_0 as last argument. All other options are
+ * accepted without action (return value 0).
+ */
+static int ccid2_hc_tx_parse_options(struct sock *sk, u8 packet_type,
+				     u8 option, u8 *optval, u8 optlen)
+{
+	struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk);
+
+	if (option != DCCPO_ACK_VECTOR_0 && option != DCCPO_ACK_VECTOR_1)
+		return 0;
+
+	return dccp_ackvec_parsed_add(&hc->tx_av_chunks, optval, optlen,
+				      option - DCCPO_ACK_VECTOR_0);
+}
+
+/*
+ * ccid2_hc_tx_packet_recv - sender-side processing of a received packet
+ * @sk: socket the packet arrived on
+ * @skb: the received packet
+ *
+ * Steps, in order:
+ * 1. update the reverse-path (tx_rpseq / tx_rpdupack) bookkeeping;
+ * 2. stop here for packets that carry no acknowledgement;
+ * 3. walk the Ack Vectors collected in tx_av_chunks and, for each history
+ * entry newly reported as received or ECN-marked, call ccid2_new_ack()
+ * or ccid2_congestion_event() and decrement pipe;
+ * 4. once NUMDUPACK acked entries are found at or below tx_high_ack, treat
+ * the older unacknowledged entries as lost;
+ * 5. trim acked entries from the tail and stop or re-arm the RTO timer;
+ * 6. schedule the transmit tasklet if the sender was blocked by cwnd before
+ * and no longer is, then release the parsed Ack Vector chunks.
+ */
+static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb)
+{
+ struct dccp_sock *dp = dccp_sk(sk);
+ struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk);
+ const bool sender_was_blocked = ccid2_cwnd_network_limited(hc);
+ struct dccp_ackvec_parsed *avp;
+ u64 ackno, seqno;
+ struct ccid2_seq *seqp;
+ int done = 0;
+ unsigned int maxincr = 0;
+
+ /* check reverse path congestion */
+ seqno = DCCP_SKB_CB(skb)->dccpd_seq;
+
+ /* XXX this whole "algorithm" is broken. Need to fix it to keep track
+ * of the seqnos of the dupacks so that rpseq and rpdupack are correct
+ * -sorbo.
+ */
+ /* need to bootstrap */
+ if (hc->tx_rpdupack == -1) {
+ hc->tx_rpdupack = 0;
+ hc->tx_rpseq = seqno;
+ } else {
+ /* check if packet is consecutive */
+ if (dccp_delta_seqno(hc->tx_rpseq, seqno) == 1)
+ hc->tx_rpseq = seqno;
+ /* it's a later packet */
+ else if (after48(seqno, hc->tx_rpseq)) {
+ hc->tx_rpdupack++;
+
+ /* check if we got enough dupacks */
+ if (hc->tx_rpdupack >= NUMDUPACK) {
+ hc->tx_rpdupack = -1; /* XXX lame */
+ hc->tx_rpseq = 0;
+#ifdef __CCID2_COPES_GRACEFULLY_WITH_ACK_CONGESTION_CONTROL__
+ /*
+ * FIXME: Ack Congestion Control is broken; in
+ * the current state instabilities occurred with
+ * Ack Ratios greater than 1; causing hang-ups
+ * and long RTO timeouts. This needs to be fixed
+ * before opening up dynamic changes. -- gerrit
+ */
+ ccid2_change_l_ack_ratio(sk, 2 * dp->dccps_l_ack_ratio);
+#endif
+ }
+ }
+ }
+
+ /* check forward path congestion */
+ /* note: this early return bypasses the cleanup at the done label */
+ if (dccp_packet_without_ack(skb))
+ return;
+
+ /* still didn't send out new data packets */
+ if (hc->tx_seqh == hc->tx_seqt)
+ goto done;
+
+ ackno = DCCP_SKB_CB(skb)->dccpd_ack_seq;
+ if (after48(ackno, hc->tx_high_ack))
+ hc->tx_high_ack = ackno;
+
+ /* starting from the tail, find the first entry not before ackno */
+ seqp = hc->tx_seqt;
+ while (before48(seqp->ccid2s_seq, ackno)) {
+ seqp = seqp->ccid2s_next;
+ if (seqp == hc->tx_seqh) {
+ seqp = hc->tx_seqh->ccid2s_prev;
+ break;
+ }
+ }
+
+ /*
+ * In slow-start, cwnd can increase up to a maximum of Ack Ratio/2
+ * packets per acknowledgement. Rounding up ensures that cwnd still
+ * advances when Ack Ratio is 1 and gives a slight edge otherwise.
+ */
+ if (hc->tx_cwnd < hc->tx_ssthresh)
+ maxincr = DIV_ROUND_UP(dp->dccps_l_ack_ratio, 2);
+
+ /* go through all ack vectors */
+ list_for_each_entry(avp, &hc->tx_av_chunks, node) {
+ /* go through this ack vector */
+ for (; avp->len--; avp->vec++) {
+ u64 ackno_end_rl = SUB48(ackno,
+ dccp_ackvec_runlen(avp->vec));
+
+ ccid2_pr_debug("ackvec %llu |%u,%u|\n",
+ (unsigned long long)ackno,
+ dccp_ackvec_state(avp->vec) >> 6,
+ dccp_ackvec_runlen(avp->vec));
+ /* if the seqno we are analyzing is larger than the
+ * current ackno, then move towards the tail of our
+ * seqnos.
+ */
+ while (after48(seqp->ccid2s_seq, ackno)) {
+ if (seqp == hc->tx_seqt) {
+ done = 1;
+ break;
+ }
+ seqp = seqp->ccid2s_prev;
+ }
+ if (done)
+ break;
+
+ /* check all seqnos in the range of the vector
+ * run length
+ */
+ while (between48(seqp->ccid2s_seq,ackno_end_rl,ackno)) {
+ const u8 state = dccp_ackvec_state(avp->vec);
+
+ /* new packet received or marked */
+ if (state != DCCPAV_NOT_RECEIVED &&
+ !seqp->ccid2s_acked) {
+ if (state == DCCPAV_ECN_MARKED)
+ ccid2_congestion_event(sk,
+ seqp);
+ else
+ ccid2_new_ack(sk, seqp,
+ &maxincr);
+
+ seqp->ccid2s_acked = 1;
+ ccid2_pr_debug("Got ack for %llu\n",
+ (unsigned long long)seqp->ccid2s_seq);
+ hc->tx_pipe--;
+ }
+ if (seqp == hc->tx_seqt) {
+ done = 1;
+ break;
+ }
+ seqp = seqp->ccid2s_prev;
+ }
+ if (done)
+ break;
+
+ /* continue with the run that precedes this one */
+ ackno = SUB48(ackno_end_rl, 1);
+ }
+ if (done)
+ break;
+ }
+
+ /* The state about what is acked should be correct now
+ * Check for NUMDUPACK
+ */
+ seqp = hc->tx_seqt;
+ while (before48(seqp->ccid2s_seq, hc->tx_high_ack)) {
+ seqp = seqp->ccid2s_next;
+ if (seqp == hc->tx_seqh) {
+ seqp = hc->tx_seqh->ccid2s_prev;
+ break;
+ }
+ }
+ /* from here on, done counts acked entries instead of being a flag */
+ done = 0;
+ while (1) {
+ if (seqp->ccid2s_acked) {
+ done++;
+ if (done == NUMDUPACK)
+ break;
+ }
+ if (seqp == hc->tx_seqt)
+ break;
+ seqp = seqp->ccid2s_prev;
+ }
+
+ /* If there are at least 3 acknowledgements, anything unacknowledged
+ * below the last sequence number is considered lost
+ */
+ if (done == NUMDUPACK) {
+ struct ccid2_seq *last_acked = seqp;
+
+ /* check for lost packets */
+ while (1) {
+ if (!seqp->ccid2s_acked) {
+ ccid2_pr_debug("Packet lost: %llu\n",
+ (unsigned long long)seqp->ccid2s_seq);
+ /* XXX need to traverse from tail -> head in
+ * order to detect multiple congestion events in
+ * one ack vector.
+ */
+ ccid2_congestion_event(sk, seqp);
+ hc->tx_pipe--;
+ }
+ if (seqp == hc->tx_seqt)
+ break;
+ seqp = seqp->ccid2s_prev;
+ }
+
+ /* everything older than last_acked is no longer tracked */
+ hc->tx_seqt = last_acked;
+ }
+
+ /* trim acked packets in tail */
+ while (hc->tx_seqt != hc->tx_seqh) {
+ if (!hc->tx_seqt->ccid2s_acked)
+ break;
+
+ hc->tx_seqt = hc->tx_seqt->ccid2s_next;
+ }
+
+ /* restart RTO timer if not all outstanding data has been acked */
+ if (hc->tx_pipe == 0)
+ sk_stop_timer(sk, &hc->tx_rtotimer);
+ else
+ sk_reset_timer(sk, &hc->tx_rtotimer, jiffies + hc->tx_rto);
+done:
+ /* check if incoming Acks allow pending packets to be sent */
+ if (sender_was_blocked && !ccid2_cwnd_network_limited(hc))
+ tasklet_schedule(&dccp_sk(sk)->dccps_xmitlet);
+ dccp_ackvec_parsed_cleanup(&hc->tx_av_chunks);
+}
+
+/*
+ * ccid2_hc_tx_init - set up the CCID-2 sender state of a half-connection
+ * @ccid: CCID instance whose private area holds the struct ccid2_hc_tx_sock
+ * @sk: socket the half-connection belongs to
+ *
+ * Returns 0 on success or -ENOMEM if the first history buffer cannot be
+ * allocated.
+ *
+ * NOTE(review): members such as tx_seqbufc, tx_pipe and tx_srtt are not set
+ * here; presumably the private area arrives zero-initialised - confirm.
+ */
+static int ccid2_hc_tx_init(struct ccid *ccid, struct sock *sk)
+{
+ struct ccid2_hc_tx_sock *hc = ccid_priv(ccid);
+ struct dccp_sock *dp = dccp_sk(sk);
+ u32 max_ratio;
+
+ /* RFC 4341, 5: initialise ssthresh to arbitrarily high (max) value */
+ hc->tx_ssthresh = ~0U;
+
+ /* Use larger initial windows (RFC 4341, section 5). */
+ hc->tx_cwnd = rfc3390_bytes_to_packets(dp->dccps_mss_cache);
+ hc->tx_expected_wnd = hc->tx_cwnd;
+
+ /* Make sure that Ack Ratio is enabled and within bounds. */
+ max_ratio = DIV_ROUND_UP(hc->tx_cwnd, 2);
+ if (dp->dccps_l_ack_ratio == 0 || dp->dccps_l_ack_ratio > max_ratio)
+ dp->dccps_l_ack_ratio = max_ratio;
+
+ /* XXX init ~ to window size... */
+ if (ccid2_hc_tx_alloc_seq(hc))
+ return -ENOMEM;
+
+ hc->tx_rto = DCCP_TIMEOUT_INIT;
+ /* -1 makes ccid2_hc_tx_packet_recv() bootstrap its reverse-path state */
+ hc->tx_rpdupack = -1;
+ hc->tx_last_cong = hc->tx_lsndtime = hc->tx_cwnd_stamp = ccid2_time_stamp;
+ hc->tx_cwnd_used = 0;
+ setup_timer(&hc->tx_rtotimer, ccid2_hc_tx_rto_expire,
+ (unsigned long)sk);
+ INIT_LIST_HEAD(&hc->tx_av_chunks);
+ return 0;
+}
+
+/*
+ * Tear down the sender state: stop the RTO timer and free every history
+ * buffer recorded in tx_seqbuf[], leaving the buffer count at zero.
+ */
+static void ccid2_hc_tx_exit(struct sock *sk)
+{
+	struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk);
+	int idx = 0;
+
+	sk_stop_timer(sk, &hc->tx_rtotimer);
+
+	while (idx < hc->tx_seqbufc)
+		kfree(hc->tx_seqbuf[idx++]);
+
+	hc->tx_seqbufc = 0;
+}
+
+/*
+ * Receiver side: count incoming data packets and send an Ack whenever the
+ * count reaches the remote Ack Ratio, restarting the count afterwards.
+ * Packets that are not data packets are ignored.
+ */
+static void ccid2_hc_rx_packet_recv(struct sock *sk, struct sk_buff *skb)
+{
+	struct ccid2_hc_rx_sock *hc = ccid2_hc_rx_sk(sk);
+
+	if (dccp_data_packet(skb)) {
+		hc->rx_num_data_pkts++;
+		if (hc->rx_num_data_pkts >= dccp_sk(sk)->dccps_r_ack_ratio) {
+			dccp_send_ack(sk);
+			hc->rx_num_data_pkts = 0;
+		}
+	}
+}
+
+/*
+ * Operations table that hooks the CCID-2 handlers of this file into the
+ * generic CCID interface. Only the sender has init/exit hooks; the receiver
+ * side consists of the packet_recv handler and its private object size.
+ */
+struct ccid_operations ccid2_ops = {
+ .ccid_id = DCCPC_CCID2,
+ .ccid_name = "TCP-like",
+ .ccid_hc_tx_obj_size = sizeof(struct ccid2_hc_tx_sock),
+ .ccid_hc_tx_init = ccid2_hc_tx_init,
+ .ccid_hc_tx_exit = ccid2_hc_tx_exit,
+ .ccid_hc_tx_send_packet = ccid2_hc_tx_send_packet,
+ .ccid_hc_tx_packet_sent = ccid2_hc_tx_packet_sent,
+ .ccid_hc_tx_parse_options = ccid2_hc_tx_parse_options,
+ .ccid_hc_tx_packet_recv = ccid2_hc_tx_packet_recv,
+ .ccid_hc_rx_obj_size = sizeof(struct ccid2_hc_rx_sock),
+ .ccid_hc_rx_packet_recv = ccid2_hc_rx_packet_recv,
+};
+
+#ifdef CONFIG_IP_DCCP_CCID2_DEBUG
+module_param(ccid2_debug, bool, 0644);
+MODULE_PARM_DESC(ccid2_debug, "Enable CCID-2 debug messages");
+#endif
diff --git a/net/dccp/ccids/ccid2.h b/net/dccp/ccids/ccid2.h
new file mode 100644
index 00000000000..18c97543e52
--- /dev/null
+++ b/net/dccp/ccids/ccid2.h
@@ -0,0 +1,133 @@
+/*
+ * Copyright (c) 2005 Andrea Bittau <a.bittau@cs.ucl.ac.uk>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+#ifndef _DCCP_CCID2_H_
+#define _DCCP_CCID2_H_
+
+#include <linux/timer.h>
+#include <linux/types.h>
+#include "../ccid.h"
+#include "../dccp.h"
+
+/*
+ * CCID-2 timestamping faces the same issues as TCP timestamping.
+ * Hence we reuse/share as much of the code as possible.
+ */
+#define ccid2_time_stamp tcp_time_stamp
+
+/* NUMDUPACK parameter from RFC 4341, p. 6 */
+#define NUMDUPACK 3
+
+/**
+ * struct ccid2_seq - one entry of the sender's packet history
+ * @ccid2s_seq: sequence number of the packet
+ * @ccid2s_sent: time the packet was sent (ccid2_time_stamp units)
+ * @ccid2s_acked: non-zero once the packet has been acknowledged
+ * @ccid2s_prev: previous (older) entry in the circular list
+ * @ccid2s_next: next (newer) entry in the circular list
+ */
+struct ccid2_seq {
+ u64 ccid2s_seq;
+ u32 ccid2s_sent;
+ int ccid2s_acked;
+ struct ccid2_seq *ccid2s_prev;
+ struct ccid2_seq *ccid2s_next;
+};
+
+#define CCID2_SEQBUF_LEN 1024
+#define CCID2_SEQBUF_MAX 128
+
+/*
+ * Multiple of congestion window to keep the sequence window at
+ * (RFC 4340 7.5.2)
+ */
+#define CCID2_WIN_CHANGE_FACTOR 5
+
+/**
+ * struct ccid2_hc_tx_sock - CCID2 TX half connection
+ * @tx_{cwnd,ssthresh,pipe}: as per RFC 4341, section 5
+ * @tx_packets_acked: Ack counter for deriving cwnd growth (RFC 3465)
+ * @tx_seqbuf: pointers to the allocated packet-history buffers (kept so
+ * that they can be freed again)
+ * @tx_seqbufc: number of entries in use in @tx_seqbuf
+ * @tx_seqh: head of the packet history (entry used for the next sent packet)
+ * @tx_seqt: tail of the packet history (oldest entry still tracked)
+ * @tx_srtt: smoothed RTT estimate, scaled by 2^3
+ * @tx_mdev: smoothed RTT variation, scaled by 2^2
+ * @tx_mdev_max: maximum of @mdev during one flight
+ * @tx_rttvar: moving average/maximum of @mdev_max
+ * @tx_rto: RTO value deriving from SRTT and RTTVAR (RFC 2988)
+ * @tx_rtt_seq: to decay RTTVAR at most once per flight
+ * @tx_rtotimer: retransmission-timeout (RTO) timer
+ * @tx_cwnd_used: actually used cwnd, W_used of RFC 2861
+ * @tx_expected_wnd: moving average of @tx_cwnd_used
+ * @tx_cwnd_stamp: to track idle periods in CWV
+ * @tx_lsndtime: last time (in jiffies) a data packet was sent
+ * @tx_rpseq: last consecutive seqno
+ * @tx_rpdupack: dupacks since rpseq
+ * @tx_last_cong: time of the last congestion event
+ * @tx_high_ack: highest acknowledgement number seen so far
+ * @tx_av_chunks: list of Ack Vectors received on current skb
+ */
+struct ccid2_hc_tx_sock {
+ u32 tx_cwnd;
+ u32 tx_ssthresh;
+ u32 tx_pipe;
+ u32 tx_packets_acked;
+ struct ccid2_seq *tx_seqbuf[CCID2_SEQBUF_MAX];
+ int tx_seqbufc;
+ struct ccid2_seq *tx_seqh;
+ struct ccid2_seq *tx_seqt;
+
+ /* RTT measurement: variables/principles are the same as in TCP */
+ u32 tx_srtt,
+ tx_mdev,
+ tx_mdev_max,
+ tx_rttvar,
+ tx_rto;
+ u64 tx_rtt_seq:48;
+ struct timer_list tx_rtotimer;
+
+ /* Congestion Window validation (optional, RFC 2861) */
+ u32 tx_cwnd_used,
+ tx_expected_wnd,
+ tx_cwnd_stamp,
+ tx_lsndtime;
+
+ u64 tx_rpseq;
+ int tx_rpdupack;
+ u32 tx_last_cong;
+ u64 tx_high_ack;
+ struct list_head tx_av_chunks;
+};
+
+/* True once pipe has reached (or exceeded) the congestion window. */
+static inline bool ccid2_cwnd_network_limited(struct ccid2_hc_tx_sock *hc)
+{
+	const u32 in_pipe = hc->tx_pipe;
+
+	return in_pipe >= hc->tx_cwnd;
+}
+
+/*
+ * Express the larger initial window of RFC 3390 as a packet count, using the
+ * thresholds given in RFC 5681, 3.1:
+ * 4 packets for an SMSS of up to 1095 bytes, 3 packets for up to 2190 bytes,
+ * and 2 packets for anything larger.
+ */
+static inline u32 rfc3390_bytes_to_packets(const u32 smss)
+{
+	if (smss <= 1095)
+		return 4;
+	if (smss <= 2190)
+		return 3;
+	return 2;
+}
+
+/**
+ * struct ccid2_hc_rx_sock - Receiving end of CCID-2 half-connection
+ * @rx_num_data_pkts: number of data packets received since last feedback
+ * (reset to zero each time an Ack is sent)
+ */
+struct ccid2_hc_rx_sock {
+ u32 rx_num_data_pkts;
+};
+
+/* Private CCID-2 sender state attached to the TX CCID of @sk. */
+static inline struct ccid2_hc_tx_sock *ccid2_hc_tx_sk(const struct sock *sk)
+{
+	const struct dccp_sock *dp = dccp_sk(sk);
+
+	return ccid_priv(dp->dccps_hc_tx_ccid);
+}
+
+/* Private CCID-2 receiver state attached to the RX CCID of @sk. */
+static inline struct ccid2_hc_rx_sock *ccid2_hc_rx_sk(const struct sock *sk)
+{
+	const struct dccp_sock *dp = dccp_sk(sk);
+
+	return ccid_priv(dp->dccps_hc_rx_ccid);
+}
+#endif /* _DCCP_CCID2_H_ */
diff --git a/net/dccp/ccids/ccid3.c b/net/dccp/ccids/ccid3.c
index aa68e0ab274..119c04317d4 100644
--- a/net/dccp/ccids/ccid3.c
+++ b/net/dccp/ccids/ccid3.c
@@ -1,8 +1,7 @@
/*
- * net/dccp/ccids/ccid3.c
- *
- * Copyright (c) 2005 The University of Waikato, Hamilton, New Zealand.
- * Copyright (c) 2005 Ian McDonald <iam4@cs.waikato.ac.nz>
+ * Copyright (c) 2007 The University of Aberdeen, Scotland, UK
+ * Copyright (c) 2005-7 The University of Waikato, Hamilton, New Zealand.
+ * Copyright (c) 2005-7 Ian McDonald <ian.mcdonald@jandi.co.nz>
*
* An implementation of the DCCP protocol
*
@@ -33,1109 +32,806 @@
* along with this program; if not, write to the Free Software
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
-
-#include <linux/config.h>
-#include "../ccid.h"
#include "../dccp.h"
-#include "lib/packet_history.h"
-#include "lib/loss_interval.h"
-#include "lib/tfrc.h"
#include "ccid3.h"
-/*
- * Reason for maths here is to avoid 32 bit overflow when a is big.
- * With this we get close to the limit.
- */
-static inline u32 usecs_div(const u32 a, const u32 b)
-{
- const u32 div = a < (UINT_MAX / (USEC_PER_SEC / 10)) ? 10 :
- a < (UINT_MAX / (USEC_PER_SEC / 50)) ? 50 :
- a < (UINT_MAX / (USEC_PER_SEC / 100)) ? 100 :
- a < (UINT_MAX / (USEC_PER_SEC / 500)) ? 500 :
- a < (UINT_MAX / (USEC_PER_SEC / 1000)) ? 1000 :
- a < (UINT_MAX / (USEC_PER_SEC / 5000)) ? 5000 :
- a < (UINT_MAX / (USEC_PER_SEC / 10000)) ? 10000 :
- a < (UINT_MAX / (USEC_PER_SEC / 50000)) ? 50000 :
- 100000;
- const u32 tmp = a * (USEC_PER_SEC / div);
- return (b >= 2 * div) ? tmp / (b / div) : tmp;
-}
+#include <asm/unaligned.h>
-static int ccid3_debug;
-
-#ifdef CCID3_DEBUG
-#define ccid3_pr_debug(format, a...) \
- do { if (ccid3_debug) \
- printk(KERN_DEBUG "%s: " format, __FUNCTION__, ##a); \
- } while (0)
+#ifdef CONFIG_IP_DCCP_CCID3_DEBUG
+static bool ccid3_debug;
+#define ccid3_pr_debug(format, a...) DCCP_PR_DEBUG(ccid3_debug, format, ##a)
#else
#define ccid3_pr_debug(format, a...)
#endif
-static struct dccp_tx_hist *ccid3_tx_hist;
-static struct dccp_rx_hist *ccid3_rx_hist;
-static struct dccp_li_hist *ccid3_li_hist;
-
-static int ccid3_init(struct sock *sk)
-{
- return 0;
-}
-
-static void ccid3_exit(struct sock *sk)
-{
-}
-
-/* TFRC sender states */
-enum ccid3_hc_tx_states {
- TFRC_SSTATE_NO_SENT = 1,
- TFRC_SSTATE_NO_FBACK,
- TFRC_SSTATE_FBACK,
- TFRC_SSTATE_TERM,
-};
-
-#ifdef CCID3_DEBUG
+/*
+ * Transmitter Half-Connection Routines
+ */
+#ifdef CONFIG_IP_DCCP_CCID3_DEBUG
static const char *ccid3_tx_state_name(enum ccid3_hc_tx_states state)
{
- static char *ccid3_state_names[] = {
+ static const char *const ccid3_state_names[] = {
[TFRC_SSTATE_NO_SENT] = "NO_SENT",
[TFRC_SSTATE_NO_FBACK] = "NO_FBACK",
[TFRC_SSTATE_FBACK] = "FBACK",
- [TFRC_SSTATE_TERM] = "TERM",
};
return ccid3_state_names[state];
}
#endif
-static inline void ccid3_hc_tx_set_state(struct sock *sk,
- enum ccid3_hc_tx_states state)
+static void ccid3_hc_tx_set_state(struct sock *sk,
+ enum ccid3_hc_tx_states state)
{
- struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk);
- enum ccid3_hc_tx_states oldstate = hctx->ccid3hctx_state;
+ struct ccid3_hc_tx_sock *hc = ccid3_hc_tx_sk(sk);
+ enum ccid3_hc_tx_states oldstate = hc->tx_state;
ccid3_pr_debug("%s(%p) %-8.8s -> %s\n",
dccp_role(sk), sk, ccid3_tx_state_name(oldstate),
ccid3_tx_state_name(state));
WARN_ON(state == oldstate);
- hctx->ccid3hctx_state = state;
+ hc->tx_state = state;
}
-/* Calculate new t_ipi (inter packet interval) by t_ipi = s / X_inst */
-static inline void ccid3_calc_new_t_ipi(struct ccid3_hc_tx_sock *hctx)
+/*
+ * Compute the initial sending rate X_init in the manner of RFC 3390:
+ *
+ * X_init = min(4 * s, max(2 * s, 4380 bytes)) / RTT
+ *
+ * Note that RFC 3390 uses MSS, RFC 4342 refers to RFC 3390, and rfc3448bis
+ * (rev-02) clarifies the use of RFC 3390 with regard to the above formula.
+ * For consistency with other parts of the code, X_init is scaled by 2^6.
+ */
+static inline u64 rfc3390_initial_rate(struct sock *sk)
+{
+ const struct ccid3_hc_tx_sock *hc = ccid3_hc_tx_sk(sk);
+ const __u32 w_init = clamp_t(__u32, 4380U, 2 * hc->tx_s, 4 * hc->tx_s);
+
+ return scaled_div(w_init << 6, hc->tx_rtt);
+}
+
+/**
+ * ccid3_update_send_interval - Calculate new t_ipi = s / X_inst
+ * This respects the granularity of X_inst (64 * bytes/second).
+ */
+static void ccid3_update_send_interval(struct ccid3_hc_tx_sock *hc)
+{
+ hc->tx_t_ipi = scaled_div32(((u64)hc->tx_s) << 6, hc->tx_x);
+
+ DCCP_BUG_ON(hc->tx_t_ipi == 0);
+ ccid3_pr_debug("t_ipi=%u, s=%u, X=%u\n", hc->tx_t_ipi,
+ hc->tx_s, (unsigned int)(hc->tx_x >> 6));
+}
+
+static u32 ccid3_hc_tx_idle_rtt(struct ccid3_hc_tx_sock *hc, ktime_t now)
+{
+ u32 delta = ktime_us_delta(now, hc->tx_t_last_win_count);
+
+ return delta / hc->tx_rtt;
+}
+
+/**
+ * ccid3_hc_tx_update_x - Update allowed sending rate X
+ * @stamp: most recent time if available - can be left NULL.
+ *
+ * This function tracks draft rfc3448bis, check there for latest details.
+ *
+ * Note: X and X_recv are both stored in units of 64 * bytes/second, to support
+ * fine-grained resolution of sending rates. This requires scaling by 2^6
+ * throughout the code. Only X_calc is unscaled (in bytes/second).
+ *
+ */
+static void ccid3_hc_tx_update_x(struct sock *sk, ktime_t *stamp)
{
+ struct ccid3_hc_tx_sock *hc = ccid3_hc_tx_sk(sk);
+ __u64 min_rate = 2 * hc->tx_x_recv;
+ const __u64 old_x = hc->tx_x;
+ ktime_t now = stamp ? *stamp : ktime_get_real();
+
/*
- * If no feedback spec says t_ipi is 1 second (set elsewhere and then
- * doubles after every no feedback timer (separate function)
+ * Handle IDLE periods: do not reduce below RFC3390 initial sending rate
+ * when idling [RFC 4342, 5.1]. Definition of idling is from rfc3448bis:
+ * a sender is idle if it has not sent anything over a 2-RTT-period.
+ * For consistency with X and X_recv, min_rate is also scaled by 2^6.
*/
- if (hctx->ccid3hctx_state != TFRC_SSTATE_NO_FBACK)
- hctx->ccid3hctx_t_ipi = usecs_div(hctx->ccid3hctx_s,
- hctx->ccid3hctx_x);
+ if (ccid3_hc_tx_idle_rtt(hc, now) >= 2) {
+ min_rate = rfc3390_initial_rate(sk);
+ min_rate = max(min_rate, 2 * hc->tx_x_recv);
+ }
+
+ if (hc->tx_p > 0) {
+
+ hc->tx_x = min(((__u64)hc->tx_x_calc) << 6, min_rate);
+ hc->tx_x = max(hc->tx_x, (((__u64)hc->tx_s) << 6) / TFRC_T_MBI);
+
+ } else if (ktime_us_delta(now, hc->tx_t_ld) - (s64)hc->tx_rtt >= 0) {
+
+ hc->tx_x = min(2 * hc->tx_x, min_rate);
+ hc->tx_x = max(hc->tx_x,
+ scaled_div(((__u64)hc->tx_s) << 6, hc->tx_rtt));
+ hc->tx_t_ld = now;
+ }
+
+ if (hc->tx_x != old_x) {
+ ccid3_pr_debug("X_prev=%u, X_now=%u, X_calc=%u, "
+ "X_recv=%u\n", (unsigned int)(old_x >> 6),
+ (unsigned int)(hc->tx_x >> 6), hc->tx_x_calc,
+ (unsigned int)(hc->tx_x_recv >> 6));
+
+ ccid3_update_send_interval(hc);
+ }
}
-/* Calculate new delta by delta = min(t_ipi / 2, t_gran / 2) */
-static inline void ccid3_calc_new_delta(struct ccid3_hc_tx_sock *hctx)
+/**
+ * ccid3_hc_tx_update_s - Track the mean packet size `s'
+ * @len: DCCP packet payload size in bytes
+ *
+ * cf. RFC 4342, 5.3 and RFC 3448, 4.1
+ */
+static inline void ccid3_hc_tx_update_s(struct ccid3_hc_tx_sock *hc, int len)
{
- hctx->ccid3hctx_delta = min_t(u32, hctx->ccid3hctx_t_ipi / 2,
- TFRC_OPSYS_HALF_TIME_GRAN);
+ const u16 old_s = hc->tx_s;
+
+ hc->tx_s = tfrc_ewma(hc->tx_s, len, 9);
+
+ if (hc->tx_s != old_s)
+ ccid3_update_send_interval(hc);
}
/*
- * Update X by
- * If (p > 0)
- * x_calc = calcX(s, R, p);
- * X = max(min(X_calc, 2 * X_recv), s / t_mbi);
- * Else
- * If (now - tld >= R)
- * X = max(min(2 * X, 2 * X_recv), s / R);
- * tld = now;
- */
-static void ccid3_hc_tx_update_x(struct sock *sk)
+ * Update Window Counter using the algorithm from [RFC 4342, 8.1].
+ * As elsewhere, RTT > 0 is assumed by using dccp_sample_rtt().
+ */
+static inline void ccid3_hc_tx_update_win_count(struct ccid3_hc_tx_sock *hc,
+ ktime_t now)
{
- struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk);
-
- /* To avoid large error in calcX */
- if (hctx->ccid3hctx_p >= TFRC_SMALLEST_P) {
- hctx->ccid3hctx_x_calc = tfrc_calc_x(hctx->ccid3hctx_s,
- hctx->ccid3hctx_rtt,
- hctx->ccid3hctx_p);
- hctx->ccid3hctx_x = max_t(u32, min_t(u32, hctx->ccid3hctx_x_calc,
- 2 * hctx->ccid3hctx_x_recv),
- (hctx->ccid3hctx_s /
- TFRC_MAX_BACK_OFF_TIME));
- } else {
- struct timeval now;
-
- dccp_timestamp(sk, &now);
- if (timeval_delta(&now, &hctx->ccid3hctx_t_ld) >=
- hctx->ccid3hctx_rtt) {
- hctx->ccid3hctx_x = max_t(u32, min_t(u32, hctx->ccid3hctx_x_recv,
- hctx->ccid3hctx_x) * 2,
- usecs_div(hctx->ccid3hctx_s,
- hctx->ccid3hctx_rtt));
- hctx->ccid3hctx_t_ld = now;
- }
+ u32 delta = ktime_us_delta(now, hc->tx_t_last_win_count),
+ quarter_rtts = (4 * delta) / hc->tx_rtt;
+
+ if (quarter_rtts > 0) {
+ hc->tx_t_last_win_count = now;
+ hc->tx_last_win_count += min(quarter_rtts, 5U);
+ hc->tx_last_win_count &= 0xF; /* mod 16 */
}
}
static void ccid3_hc_tx_no_feedback_timer(unsigned long data)
{
struct sock *sk = (struct sock *)data;
- unsigned long next_tmout = 0;
- struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk);
+ struct ccid3_hc_tx_sock *hc = ccid3_hc_tx_sk(sk);
+ unsigned long t_nfb = USEC_PER_SEC / 5;
bh_lock_sock(sk);
if (sock_owned_by_user(sk)) {
/* Try again later. */
/* XXX: set some sensible MIB */
- sk_reset_timer(sk, &hctx->ccid3hctx_no_feedback_timer,
- jiffies + HZ / 5);
- goto out;
+ goto restart_timer;
}
- ccid3_pr_debug("%s, sk=%p, state=%s\n", dccp_role(sk), sk,
- ccid3_tx_state_name(hctx->ccid3hctx_state));
-
- switch (hctx->ccid3hctx_state) {
- case TFRC_SSTATE_TERM:
+ ccid3_pr_debug("%s(%p, state=%s) - entry\n", dccp_role(sk), sk,
+ ccid3_tx_state_name(hc->tx_state));
+
+ /* Ignore and do not restart after leaving the established state */
+ if ((1 << sk->sk_state) & ~(DCCPF_OPEN | DCCPF_PARTOPEN))
goto out;
- case TFRC_SSTATE_NO_FBACK:
- /* Halve send rate */
- hctx->ccid3hctx_x /= 2;
- if (hctx->ccid3hctx_x < (hctx->ccid3hctx_s /
- TFRC_MAX_BACK_OFF_TIME))
- hctx->ccid3hctx_x = (hctx->ccid3hctx_s /
- TFRC_MAX_BACK_OFF_TIME);
-
- ccid3_pr_debug("%s, sk=%p, state=%s, updated tx rate to %d "
- "bytes/s\n",
- dccp_role(sk), sk,
- ccid3_tx_state_name(hctx->ccid3hctx_state),
- hctx->ccid3hctx_x);
- next_tmout = max_t(u32, 2 * usecs_div(hctx->ccid3hctx_s,
- hctx->ccid3hctx_x),
- TFRC_INITIAL_TIMEOUT);
- /*
- * FIXME - not sure above calculation is correct. See section
- * 5 of CCID3 11 should adjust tx_t_ipi and double that to
- * achieve it really
- */
- break;
- case TFRC_SSTATE_FBACK:
+
+ /* Reset feedback state to "no feedback received" */
+ if (hc->tx_state == TFRC_SSTATE_FBACK)
+ ccid3_hc_tx_set_state(sk, TFRC_SSTATE_NO_FBACK);
+
+ /*
+ * Determine new allowed sending rate X as per draft rfc3448bis-00, 4.4
+ * RTO is 0 if and only if no feedback has been received yet.
+ */
+ if (hc->tx_t_rto == 0 || hc->tx_p == 0) {
+
+ /* halve send rate directly */
+ hc->tx_x = max(hc->tx_x / 2,
+ (((__u64)hc->tx_s) << 6) / TFRC_T_MBI);
+ ccid3_update_send_interval(hc);
+ } else {
/*
- * Check if IDLE since last timeout and recv rate is less than
- * 4 packets per RTT
+ * Modify the cached value of X_recv
+ *
+ * If (X_calc > 2 * X_recv)
+ * X_recv = max(X_recv / 2, s / (2 * t_mbi));
+ * Else
+ * X_recv = X_calc / 4;
+ *
+ * Note that X_recv is scaled by 2^6 while X_calc is not
*/
- if (!hctx->ccid3hctx_idle ||
- (hctx->ccid3hctx_x_recv >=
- 4 * usecs_div(hctx->ccid3hctx_s, hctx->ccid3hctx_rtt))) {
- ccid3_pr_debug("%s, sk=%p, state=%s, not idle\n",
- dccp_role(sk), sk,
- ccid3_tx_state_name(hctx->ccid3hctx_state));
- /* Halve sending rate */
-
- /* If (X_calc > 2 * X_recv)
- * X_recv = max(X_recv / 2, s / (2 * t_mbi));
- * Else
- * X_recv = X_calc / 4;
- */
- BUG_ON(hctx->ccid3hctx_p >= TFRC_SMALLEST_P &&
- hctx->ccid3hctx_x_calc == 0);
-
- /* check also if p is zero -> x_calc is infinity? */
- if (hctx->ccid3hctx_p < TFRC_SMALLEST_P ||
- hctx->ccid3hctx_x_calc > 2 * hctx->ccid3hctx_x_recv)
- hctx->ccid3hctx_x_recv = max_t(u32, hctx->ccid3hctx_x_recv / 2,
- hctx->ccid3hctx_s / (2 * TFRC_MAX_BACK_OFF_TIME));
- else
- hctx->ccid3hctx_x_recv = hctx->ccid3hctx_x_calc / 4;
-
- /* Update sending rate */
- ccid3_hc_tx_update_x(sk);
+ if (hc->tx_x_calc > (hc->tx_x_recv >> 5))
+ hc->tx_x_recv =
+ max(hc->tx_x_recv / 2,
+ (((__u64)hc->tx_s) << 6) / (2*TFRC_T_MBI));
+ else {
+ hc->tx_x_recv = hc->tx_x_calc;
+ hc->tx_x_recv <<= 4;
}
- /*
- * Schedule no feedback timer to expire in
- * max(4 * R, 2 * s / X)
- */
- next_tmout = max_t(u32, hctx->ccid3hctx_t_rto,
- 2 * usecs_div(hctx->ccid3hctx_s,
- hctx->ccid3hctx_x));
- break;
- default:
- printk(KERN_CRIT "%s: %s, sk=%p, Illegal state (%d)!\n",
- __FUNCTION__, dccp_role(sk), sk, hctx->ccid3hctx_state);
- dump_stack();
- goto out;
+ ccid3_hc_tx_update_x(sk, NULL);
}
+ ccid3_pr_debug("Reduced X to %llu/64 bytes/sec\n",
+ (unsigned long long)hc->tx_x);
+
+ /*
+ * Set new timeout for the nofeedback timer.
+ * See comments in packet_recv() regarding the value of t_RTO.
+ */
+ if (unlikely(hc->tx_t_rto == 0)) /* no feedback received yet */
+ t_nfb = TFRC_INITIAL_TIMEOUT;
+ else
+ t_nfb = max(hc->tx_t_rto, 2 * hc->tx_t_ipi);
- sk_reset_timer(sk, &hctx->ccid3hctx_no_feedback_timer,
- jiffies + max_t(u32, 1, usecs_to_jiffies(next_tmout)));
- hctx->ccid3hctx_idle = 1;
+restart_timer:
+ sk_reset_timer(sk, &hc->tx_no_feedback_timer,
+ jiffies + usecs_to_jiffies(t_nfb));
out:
bh_unlock_sock(sk);
sock_put(sk);
}
-static int ccid3_hc_tx_send_packet(struct sock *sk,
- struct sk_buff *skb, int len)
+/**
+ * ccid3_hc_tx_send_packet - Delay-based dequeueing of TX packets
+ * @skb: next packet candidate to send on @sk
+ *
+ * This function uses the convention of ccid_packet_dequeue_eval() and
+ * returns a millisecond-delay value between 0 and t_mbi = 64000 msec.
+ */
+static int ccid3_hc_tx_send_packet(struct sock *sk, struct sk_buff *skb)
{
struct dccp_sock *dp = dccp_sk(sk);
- struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk);
- struct dccp_tx_hist_entry *new_packet;
- struct timeval now;
- long delay;
- int rc = -ENOTCONN;
+ struct ccid3_hc_tx_sock *hc = ccid3_hc_tx_sk(sk);
+ ktime_t now = ktime_get_real();
+ s64 delay;
- BUG_ON(hctx == NULL || hctx->ccid3hctx_state == TFRC_SSTATE_TERM);
-
- /* Check if pure ACK or Terminating*/
/*
- * XXX: We only call this function for DATA and DATAACK, on, these
- * packets can have zero length, but why the comment about "pure ACK"?
+ * This function is called only for Data and DataAck packets. Sending
+ * zero-sized Data(Ack)s is theoretically possible, but for congestion
+ * control this case is pathological - ignore it.
*/
- if (unlikely(len == 0))
- goto out;
+ if (unlikely(skb->len == 0))
+ return -EBADMSG;
- /* See if last packet allocated was not sent */
- new_packet = dccp_tx_hist_head(&hctx->ccid3hctx_hist);
- if (new_packet == NULL || new_packet->dccphtx_sent) {
- new_packet = dccp_tx_hist_entry_new(ccid3_tx_hist,
- SLAB_ATOMIC);
-
- rc = -ENOBUFS;
- if (unlikely(new_packet == NULL)) {
- LIMIT_NETDEBUG(KERN_WARNING "%s: %s, sk=%p, not enough "
- "mem to add to history, send refused\n",
- __FUNCTION__, dccp_role(sk), sk);
- goto out;
- }
+ if (hc->tx_state == TFRC_SSTATE_NO_SENT) {
+ sk_reset_timer(sk, &hc->tx_no_feedback_timer, (jiffies +
+ usecs_to_jiffies(TFRC_INITIAL_TIMEOUT)));
+ hc->tx_last_win_count = 0;
+ hc->tx_t_last_win_count = now;
- dccp_tx_hist_add_entry(&hctx->ccid3hctx_hist, new_packet);
- }
+ /* Set t_0 for initial packet */
+ hc->tx_t_nom = now;
- dccp_timestamp(sk, &now);
+ hc->tx_s = skb->len;
+
+ /*
+ * Use initial RTT sample when available: recommended by erratum
+ * to RFC 4342. This implements the initialisation procedure of
+ * draft rfc3448bis, section 4.2. Remember, X is scaled by 2^6.
+ */
+ if (dp->dccps_syn_rtt) {
+ ccid3_pr_debug("SYN RTT = %uus\n", dp->dccps_syn_rtt);
+ hc->tx_rtt = dp->dccps_syn_rtt;
+ hc->tx_x = rfc3390_initial_rate(sk);
+ hc->tx_t_ld = now;
+ } else {
+ /*
+ * Sender does not have RTT sample:
+ * - set fallback RTT (RFC 4340, 3.4) since a RTT value
+ * is needed in several parts (e.g. window counter);
+ * - set sending rate X_pps = 1pps as per RFC 3448, 4.2.
+ */
+ hc->tx_rtt = DCCP_FALLBACK_RTT;
+ hc->tx_x = hc->tx_s;
+ hc->tx_x <<= 6;
+ }
+ ccid3_update_send_interval(hc);
- switch (hctx->ccid3hctx_state) {
- case TFRC_SSTATE_NO_SENT:
- hctx->ccid3hctx_no_feedback_timer.function = ccid3_hc_tx_no_feedback_timer;
- hctx->ccid3hctx_no_feedback_timer.data = (unsigned long)sk;
- sk_reset_timer(sk, &hctx->ccid3hctx_no_feedback_timer,
- jiffies + usecs_to_jiffies(TFRC_INITIAL_TIMEOUT));
- hctx->ccid3hctx_last_win_count = 0;
- hctx->ccid3hctx_t_last_win_count = now;
ccid3_hc_tx_set_state(sk, TFRC_SSTATE_NO_FBACK);
- hctx->ccid3hctx_t_ipi = TFRC_INITIAL_IPI;
-
- /* Set nominal send time for initial packet */
- hctx->ccid3hctx_t_nom = now;
- timeval_add_usecs(&hctx->ccid3hctx_t_nom,
- hctx->ccid3hctx_t_ipi);
- ccid3_calc_new_delta(hctx);
- rc = 0;
- break;
- case TFRC_SSTATE_NO_FBACK:
- case TFRC_SSTATE_FBACK:
- delay = (timeval_delta(&now, &hctx->ccid3hctx_t_nom) -
- hctx->ccid3hctx_delta);
- delay /= -1000;
- /* divide by -1000 is to convert to ms and get sign right */
- rc = delay > 0 ? delay : 0;
- break;
- default:
- printk(KERN_CRIT "%s: %s, sk=%p, Illegal state (%d)!\n",
- __FUNCTION__, dccp_role(sk), sk, hctx->ccid3hctx_state);
- dump_stack();
- rc = -EINVAL;
- break;
- }
- /* Can we send? if so add options and add to packet history */
- if (rc == 0) {
- dp->dccps_hc_tx_insert_options = 1;
- new_packet->dccphtx_ccval =
- DCCP_SKB_CB(skb)->dccpd_ccval =
- hctx->ccid3hctx_last_win_count;
- }
-out:
- return rc;
-}
+ } else {
+ delay = ktime_us_delta(hc->tx_t_nom, now);
+ ccid3_pr_debug("delay=%ld\n", (long)delay);
+ /*
+ * Scheduling of packet transmissions (RFC 5348, 8.3)
+ *
+ * if (t_now > t_nom - delta)
+ * // send the packet now
+ * else
+ * // send the packet in (t_nom - t_now) milliseconds.
+ */
+ if (delay >= TFRC_T_DELTA)
+ return (u32)delay / USEC_PER_MSEC;
-static void ccid3_hc_tx_packet_sent(struct sock *sk, int more, int len)
-{
- const struct dccp_sock *dp = dccp_sk(sk);
- struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk);
- struct timeval now;
+ ccid3_hc_tx_update_win_count(hc, now);
+ }
- BUG_ON(hctx == NULL || hctx->ccid3hctx_state == TFRC_SSTATE_TERM);
+ /* prepare to send now (add options etc.) */
+ dp->dccps_hc_tx_insert_options = 1;
+ DCCP_SKB_CB(skb)->dccpd_ccval = hc->tx_last_win_count;
- dccp_timestamp(sk, &now);
+ /* set the nominal send time for the next following packet */
+ hc->tx_t_nom = ktime_add_us(hc->tx_t_nom, hc->tx_t_ipi);
+ return CCID_PACKET_SEND_AT_ONCE;
+}
- /* check if we have sent a data packet */
- if (len > 0) {
- unsigned long quarter_rtt;
- struct dccp_tx_hist_entry *packet;
+static void ccid3_hc_tx_packet_sent(struct sock *sk, unsigned int len)
+{
+ struct ccid3_hc_tx_sock *hc = ccid3_hc_tx_sk(sk);
- packet = dccp_tx_hist_head(&hctx->ccid3hctx_hist);
- if (unlikely(packet == NULL)) {
- LIMIT_NETDEBUG(KERN_WARNING "%s: packet doesn't "
- "exists in history!\n", __FUNCTION__);
- return;
- }
- if (unlikely(packet->dccphtx_sent)) {
- LIMIT_NETDEBUG(KERN_WARNING "%s: no unsent packet in "
- "history!\n", __FUNCTION__);
- return;
- }
- packet->dccphtx_tstamp = now;
- packet->dccphtx_seqno = dp->dccps_gss;
- /*
- * Check if win_count have changed
- * Algorithm in "8.1. Window Counter Valuer" in
- * draft-ietf-dccp-ccid3-11.txt
- */
- quarter_rtt = timeval_delta(&now, &hctx->ccid3hctx_t_last_win_count);
- if (likely(hctx->ccid3hctx_rtt > 8))
- quarter_rtt /= hctx->ccid3hctx_rtt / 4;
-
- if (quarter_rtt > 0) {
- hctx->ccid3hctx_t_last_win_count = now;
- hctx->ccid3hctx_last_win_count = (hctx->ccid3hctx_last_win_count +
- min_t(unsigned long, quarter_rtt, 5)) % 16;
- ccid3_pr_debug("%s, sk=%p, window changed from "
- "%u to %u!\n",
- dccp_role(sk), sk,
- packet->dccphtx_ccval,
- hctx->ccid3hctx_last_win_count);
- }
+ ccid3_hc_tx_update_s(hc, len);
- hctx->ccid3hctx_idle = 0;
- packet->dccphtx_rtt = hctx->ccid3hctx_rtt;
- packet->dccphtx_sent = 1;
- } else
- ccid3_pr_debug("%s, sk=%p, seqno=%llu NOT inserted!\n",
- dccp_role(sk), sk, dp->dccps_gss);
-
- switch (hctx->ccid3hctx_state) {
- case TFRC_SSTATE_NO_SENT:
- /* if first wasn't pure ack */
- if (len != 0)
- printk(KERN_CRIT "%s: %s, First packet sent is noted "
- "as a data packet\n",
- __FUNCTION__, dccp_role(sk));
- return;
- case TFRC_SSTATE_NO_FBACK:
- case TFRC_SSTATE_FBACK:
- if (len > 0) {
- hctx->ccid3hctx_t_nom = now;
- ccid3_calc_new_t_ipi(hctx);
- ccid3_calc_new_delta(hctx);
- timeval_add_usecs(&hctx->ccid3hctx_t_nom,
- hctx->ccid3hctx_t_ipi);
- }
- break;
- default:
- printk(KERN_CRIT "%s: %s, sk=%p, Illegal state (%d)!\n",
- __FUNCTION__, dccp_role(sk), sk, hctx->ccid3hctx_state);
- dump_stack();
- break;
- }
+ if (tfrc_tx_hist_add(&hc->tx_hist, dccp_sk(sk)->dccps_gss))
+ DCCP_CRIT("packet history - out of memory!");
}
static void ccid3_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb)
{
- const struct dccp_sock *dp = dccp_sk(sk);
- struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk);
- struct ccid3_options_received *opt_recv;
- struct dccp_tx_hist_entry *packet;
- struct timeval now;
- unsigned long next_tmout;
- u32 t_elapsed;
- u32 pinv;
- u32 x_recv;
+ struct ccid3_hc_tx_sock *hc = ccid3_hc_tx_sk(sk);
+ struct tfrc_tx_hist_entry *acked;
+ ktime_t now;
+ unsigned long t_nfb;
u32 r_sample;
- BUG_ON(hctx == NULL || hctx->ccid3hctx_state == TFRC_SSTATE_TERM);
-
/* we are only interested in ACKs */
if (!(DCCP_SKB_CB(skb)->dccpd_type == DCCP_PKT_ACK ||
DCCP_SKB_CB(skb)->dccpd_type == DCCP_PKT_DATAACK))
return;
+ /*
+ * Locate the acknowledged packet in the TX history.
+ *
+ * Returning "entry not found" here can for instance happen when
+ * - the host has not sent out anything (e.g. a passive server),
+ * - the Ack is outdated (packet with higher Ack number was received),
+ * - it is a bogus Ack (for a packet not sent on this connection).
+ */
+ acked = tfrc_tx_hist_find_entry(hc->tx_hist, dccp_hdr_ack_seq(skb));
+ if (acked == NULL)
+ return;
+ /* For the sake of RTT sampling, ignore/remove all older entries */
+ tfrc_tx_hist_purge(&acked->next);
- opt_recv = &hctx->ccid3hctx_options_received;
+ /* Update the moving average for the RTT estimate (RFC 3448, 4.3) */
+ now = ktime_get_real();
+ r_sample = dccp_sample_rtt(sk, ktime_us_delta(now, acked->stamp));
+ hc->tx_rtt = tfrc_ewma(hc->tx_rtt, r_sample, 9);
- t_elapsed = dp->dccps_options_received.dccpor_elapsed_time * 10;
- x_recv = opt_recv->ccid3or_receive_rate;
- pinv = opt_recv->ccid3or_loss_event_rate;
+ /*
+ * Update allowed sending rate X as per draft rfc3448bis-00, 4.2/3
+ */
+ if (hc->tx_state == TFRC_SSTATE_NO_FBACK) {
+ ccid3_hc_tx_set_state(sk, TFRC_SSTATE_FBACK);
- switch (hctx->ccid3hctx_state) {
- case TFRC_SSTATE_NO_SENT:
- /* FIXME: what to do here? */
- return;
- case TFRC_SSTATE_NO_FBACK:
- case TFRC_SSTATE_FBACK:
- /* Calculate new round trip sample by
- * R_sample = (now - t_recvdata) - t_delay */
- /* get t_recvdata from history */
- packet = dccp_tx_hist_find_entry(&hctx->ccid3hctx_hist,
- DCCP_SKB_CB(skb)->dccpd_ack_seq);
- if (unlikely(packet == NULL)) {
- LIMIT_NETDEBUG(KERN_WARNING "%s: %s, sk=%p, seqno "
- "%llu(%s) does't exist in history!\n",
- __FUNCTION__, dccp_role(sk), sk,
- (unsigned long long)DCCP_SKB_CB(skb)->dccpd_ack_seq,
- dccp_packet_name(DCCP_SKB_CB(skb)->dccpd_type));
- return;
- }
+ if (hc->tx_t_rto == 0) {
+ /*
+ * Initial feedback packet: Larger Initial Windows (4.2)
+ */
+ hc->tx_x = rfc3390_initial_rate(sk);
+ hc->tx_t_ld = now;
- /* Update RTT */
- dccp_timestamp(sk, &now);
- r_sample = timeval_delta(&now, &packet->dccphtx_tstamp);
- if (unlikely(r_sample <= t_elapsed))
- LIMIT_NETDEBUG(KERN_WARNING "%s: r_sample=%uus, "
- "t_elapsed=%uus\n",
- __FUNCTION__, r_sample, t_elapsed);
- else
- r_sample -= t_elapsed;
+ ccid3_update_send_interval(hc);
- /* Update RTT estimate by
- * If (No feedback recv)
- * R = R_sample;
- * Else
- * R = q * R + (1 - q) * R_sample;
- *
- * q is a constant, RFC 3448 recomments 0.9
- */
- if (hctx->ccid3hctx_state == TFRC_SSTATE_NO_FBACK) {
- ccid3_hc_tx_set_state(sk, TFRC_SSTATE_FBACK);
- hctx->ccid3hctx_rtt = r_sample;
- } else
- hctx->ccid3hctx_rtt = (hctx->ccid3hctx_rtt * 9) / 10 +
- r_sample / 10;
-
- ccid3_pr_debug("%s, sk=%p, New RTT estimate=%uus, "
- "r_sample=%us\n", dccp_role(sk), sk,
- hctx->ccid3hctx_rtt, r_sample);
-
- /* Update timeout interval */
- hctx->ccid3hctx_t_rto = max_t(u32, 4 * hctx->ccid3hctx_rtt,
- USEC_PER_SEC);
-
- /* Update receive rate */
- hctx->ccid3hctx_x_recv = x_recv;/* X_recv in bytes per sec */
-
- /* Update loss event rate */
- if (pinv == ~0 || pinv == 0)
- hctx->ccid3hctx_p = 0;
- else {
- hctx->ccid3hctx_p = 1000000 / pinv;
-
- if (hctx->ccid3hctx_p < TFRC_SMALLEST_P) {
- hctx->ccid3hctx_p = TFRC_SMALLEST_P;
- ccid3_pr_debug("%s, sk=%p, Smallest p used!\n",
- dccp_role(sk), sk);
- }
+ goto done_computing_x;
+ } else if (hc->tx_p == 0) {
+ /*
+ * First feedback after nofeedback timer expiry (4.3)
+ */
+ goto done_computing_x;
}
+ }
- /* unschedule no feedback timer */
- sk_stop_timer(sk, &hctx->ccid3hctx_no_feedback_timer);
-
- /* Update sending rate */
- ccid3_hc_tx_update_x(sk);
+ /* Update sending rate (step 4 of [RFC 3448, 4.3]) */
+ if (hc->tx_p > 0)
+ hc->tx_x_calc = tfrc_calc_x(hc->tx_s, hc->tx_rtt, hc->tx_p);
+ ccid3_hc_tx_update_x(sk, &now);
- /* Update next send time */
- timeval_sub_usecs(&hctx->ccid3hctx_t_nom,
- hctx->ccid3hctx_t_ipi);
- ccid3_calc_new_t_ipi(hctx);
- timeval_add_usecs(&hctx->ccid3hctx_t_nom,
- hctx->ccid3hctx_t_ipi);
- ccid3_calc_new_delta(hctx);
+done_computing_x:
+ ccid3_pr_debug("%s(%p), RTT=%uus (sample=%uus), s=%u, "
+ "p=%u, X_calc=%u, X_recv=%u, X=%u\n",
+ dccp_role(sk), sk, hc->tx_rtt, r_sample,
+ hc->tx_s, hc->tx_p, hc->tx_x_calc,
+ (unsigned int)(hc->tx_x_recv >> 6),
+ (unsigned int)(hc->tx_x >> 6));
- /* remove all packets older than the one acked from history */
- dccp_tx_hist_purge_older(ccid3_tx_hist,
- &hctx->ccid3hctx_hist, packet);
- /*
- * As we have calculated new ipi, delta, t_nom it is possible that
- * we now can send a packet, so wake up dccp_wait_for_ccids.
- */
- sk->sk_write_space(sk);
+ /* unschedule no feedback timer */
+ sk_stop_timer(sk, &hc->tx_no_feedback_timer);
- /*
- * Schedule no feedback timer to expire in
- * max(4 * R, 2 * s / X)
- */
- next_tmout = max(hctx->ccid3hctx_t_rto,
- 2 * usecs_div(hctx->ccid3hctx_s,
- hctx->ccid3hctx_x));
-
- ccid3_pr_debug("%s, sk=%p, Scheduled no feedback timer to "
- "expire in %lu jiffies (%luus)\n",
- dccp_role(sk), sk,
- usecs_to_jiffies(next_tmout), next_tmout);
-
- sk_reset_timer(sk, &hctx->ccid3hctx_no_feedback_timer,
- jiffies + max_t(u32, 1, usecs_to_jiffies(next_tmout)));
-
- /* set idle flag */
- hctx->ccid3hctx_idle = 1;
- break;
- default:
- printk(KERN_CRIT "%s: %s, sk=%p, Illegal state (%d)!\n",
- __FUNCTION__, dccp_role(sk), sk, hctx->ccid3hctx_state);
- dump_stack();
- break;
- }
-}
-
-static void ccid3_hc_tx_insert_options(struct sock *sk, struct sk_buff *skb)
-{
- const struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk);
+ /*
+ * As we have calculated new ipi, delta, t_nom it is possible
+ * that we now can send a packet, so wake up dccp_wait_for_ccid
+ */
+ sk->sk_write_space(sk);
- BUG_ON(hctx == NULL);
+ /*
+ * Update timeout interval for the nofeedback timer. In order to control
+ * rate halving on networks with very low RTTs (<= 1 ms), use per-route
+ * tunable RTAX_RTO_MIN value as the lower bound.
+ */
+ hc->tx_t_rto = max_t(u32, 4 * hc->tx_rtt,
+ USEC_PER_SEC/HZ * tcp_rto_min(sk));
+ /*
+ * Schedule no feedback timer to expire in
+ * max(t_RTO, 2 * s/X) = max(t_RTO, 2 * t_ipi)
+ */
+ t_nfb = max(hc->tx_t_rto, 2 * hc->tx_t_ipi);
- if (!(sk->sk_state == DCCP_OPEN || sk->sk_state == DCCP_PARTOPEN))
- return;
+ ccid3_pr_debug("%s(%p), Scheduled no feedback timer to "
+ "expire in %lu jiffies (%luus)\n",
+ dccp_role(sk), sk, usecs_to_jiffies(t_nfb), t_nfb);
- DCCP_SKB_CB(skb)->dccpd_ccval = hctx->ccid3hctx_last_win_count;
+ sk_reset_timer(sk, &hc->tx_no_feedback_timer,
+ jiffies + usecs_to_jiffies(t_nfb));
}
-static int ccid3_hc_tx_parse_options(struct sock *sk, unsigned char option,
- unsigned char len, u16 idx,
- unsigned char *value)
+static int ccid3_hc_tx_parse_options(struct sock *sk, u8 packet_type,
+ u8 option, u8 *optval, u8 optlen)
{
- int rc = 0;
- const struct dccp_sock *dp = dccp_sk(sk);
- struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk);
- struct ccid3_options_received *opt_recv;
-
- BUG_ON(hctx == NULL);
-
- opt_recv = &hctx->ccid3hctx_options_received;
-
- if (opt_recv->ccid3or_seqno != dp->dccps_gsr) {
- opt_recv->ccid3or_seqno = dp->dccps_gsr;
- opt_recv->ccid3or_loss_event_rate = ~0;
- opt_recv->ccid3or_loss_intervals_idx = 0;
- opt_recv->ccid3or_loss_intervals_len = 0;
- opt_recv->ccid3or_receive_rate = 0;
- }
+ struct ccid3_hc_tx_sock *hc = ccid3_hc_tx_sk(sk);
+ __be32 opt_val;
switch (option) {
+ case TFRC_OPT_RECEIVE_RATE:
case TFRC_OPT_LOSS_EVENT_RATE:
- if (unlikely(len != 4)) {
- LIMIT_NETDEBUG(KERN_WARNING "%s: %s, sk=%p, invalid "
- "len for TFRC_OPT_LOSS_EVENT_RATE\n",
- __FUNCTION__, dccp_role(sk), sk);
- rc = -EINVAL;
- } else {
- opt_recv->ccid3or_loss_event_rate = ntohl(*(u32 *)value);
- ccid3_pr_debug("%s, sk=%p, LOSS_EVENT_RATE=%u\n",
- dccp_role(sk), sk,
- opt_recv->ccid3or_loss_event_rate);
+ /* Must be ignored on Data packets, cf. RFC 4342 8.3 and 8.5 */
+ if (packet_type == DCCP_PKT_DATA)
+ break;
+ if (unlikely(optlen != 4)) {
+ DCCP_WARN("%s(%p), invalid len %d for %u\n",
+ dccp_role(sk), sk, optlen, option);
+ return -EINVAL;
}
- break;
- case TFRC_OPT_LOSS_INTERVALS:
- opt_recv->ccid3or_loss_intervals_idx = idx;
- opt_recv->ccid3or_loss_intervals_len = len;
- ccid3_pr_debug("%s, sk=%p, LOSS_INTERVALS=(%u, %u)\n",
- dccp_role(sk), sk,
- opt_recv->ccid3or_loss_intervals_idx,
- opt_recv->ccid3or_loss_intervals_len);
- break;
- case TFRC_OPT_RECEIVE_RATE:
- if (unlikely(len != 4)) {
- LIMIT_NETDEBUG(KERN_WARNING "%s: %s, sk=%p, invalid "
- "len for TFRC_OPT_RECEIVE_RATE\n",
- __FUNCTION__, dccp_role(sk), sk);
- rc = -EINVAL;
+ opt_val = ntohl(get_unaligned((__be32 *)optval));
+
+ if (option == TFRC_OPT_RECEIVE_RATE) {
+ /* Receive Rate is kept in units of 64 bytes/second */
+ hc->tx_x_recv = opt_val;
+ hc->tx_x_recv <<= 6;
+
+ ccid3_pr_debug("%s(%p), RECEIVE_RATE=%u\n",
+ dccp_role(sk), sk, opt_val);
} else {
- opt_recv->ccid3or_receive_rate = ntohl(*(u32 *)value);
- ccid3_pr_debug("%s, sk=%p, RECEIVE_RATE=%u\n",
- dccp_role(sk), sk,
- opt_recv->ccid3or_receive_rate);
+ /* Update the fixpoint Loss Event Rate fraction */
+ hc->tx_p = tfrc_invert_loss_event_rate(opt_val);
+
+ ccid3_pr_debug("%s(%p), LOSS_EVENT_RATE=%u\n",
+ dccp_role(sk), sk, opt_val);
}
- break;
}
-
- return rc;
+ return 0;
}
-static int ccid3_hc_tx_init(struct sock *sk)
+static int ccid3_hc_tx_init(struct ccid *ccid, struct sock *sk)
{
- struct dccp_sock *dp = dccp_sk(sk);
- struct ccid3_hc_tx_sock *hctx;
-
- dp->dccps_hc_tx_ccid_private = kmalloc(sizeof(*hctx), gfp_any());
- if (dp->dccps_hc_tx_ccid_private == NULL)
- return -ENOMEM;
-
- hctx = ccid3_hc_tx_sk(sk);
- memset(hctx, 0, sizeof(*hctx));
-
- if (dp->dccps_packet_size >= TFRC_MIN_PACKET_SIZE &&
- dp->dccps_packet_size <= TFRC_MAX_PACKET_SIZE)
- hctx->ccid3hctx_s = dp->dccps_packet_size;
- else
- hctx->ccid3hctx_s = TFRC_STD_PACKET_SIZE;
-
- /* Set transmission rate to 1 packet per second */
- hctx->ccid3hctx_x = hctx->ccid3hctx_s;
- hctx->ccid3hctx_t_rto = USEC_PER_SEC;
- hctx->ccid3hctx_state = TFRC_SSTATE_NO_SENT;
- INIT_LIST_HEAD(&hctx->ccid3hctx_hist);
- init_timer(&hctx->ccid3hctx_no_feedback_timer);
+ struct ccid3_hc_tx_sock *hc = ccid_priv(ccid);
+ hc->tx_state = TFRC_SSTATE_NO_SENT;
+ hc->tx_hist = NULL;
+ setup_timer(&hc->tx_no_feedback_timer,
+ ccid3_hc_tx_no_feedback_timer, (unsigned long)sk);
return 0;
}
static void ccid3_hc_tx_exit(struct sock *sk)
{
- struct dccp_sock *dp = dccp_sk(sk);
- struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk);
+ struct ccid3_hc_tx_sock *hc = ccid3_hc_tx_sk(sk);
- BUG_ON(hctx == NULL);
+ sk_stop_timer(sk, &hc->tx_no_feedback_timer);
+ tfrc_tx_hist_purge(&hc->tx_hist);
+}
- ccid3_hc_tx_set_state(sk, TFRC_SSTATE_TERM);
- sk_stop_timer(sk, &hctx->ccid3hctx_no_feedback_timer);
+static void ccid3_hc_tx_get_info(struct sock *sk, struct tcp_info *info)
+{
+ info->tcpi_rto = ccid3_hc_tx_sk(sk)->tx_t_rto;
+ info->tcpi_rtt = ccid3_hc_tx_sk(sk)->tx_rtt;
+}
+
+static int ccid3_hc_tx_getsockopt(struct sock *sk, const int optname, int len,
+ u32 __user *optval, int __user *optlen)
+{
+ const struct ccid3_hc_tx_sock *hc = ccid3_hc_tx_sk(sk);
+ struct tfrc_tx_info tfrc;
+ const void *val;
+
+ switch (optname) {
+ case DCCP_SOCKOPT_CCID_TX_INFO:
+ if (len < sizeof(tfrc))
+ return -EINVAL;
+ memset(&tfrc, 0, sizeof(tfrc));
+ tfrc.tfrctx_x = hc->tx_x;
+ tfrc.tfrctx_x_recv = hc->tx_x_recv;
+ tfrc.tfrctx_x_calc = hc->tx_x_calc;
+ tfrc.tfrctx_rtt = hc->tx_rtt;
+ tfrc.tfrctx_p = hc->tx_p;
+ tfrc.tfrctx_rto = hc->tx_t_rto;
+ tfrc.tfrctx_ipi = hc->tx_t_ipi;
+ len = sizeof(tfrc);
+ val = &tfrc;
+ break;
+ default:
+ return -ENOPROTOOPT;
+ }
- /* Empty packet history */
- dccp_tx_hist_purge(ccid3_tx_hist, &hctx->ccid3hctx_hist);
+ if (put_user(len, optlen) || copy_to_user(optval, val, len))
+ return -EFAULT;
- kfree(dp->dccps_hc_tx_ccid_private);
- dp->dccps_hc_tx_ccid_private = NULL;
+ return 0;
}
/*
- * RX Half Connection methods
+ * Receiver Half-Connection Routines
*/
-/* TFRC receiver states */
-enum ccid3_hc_rx_states {
- TFRC_RSTATE_NO_DATA = 1,
- TFRC_RSTATE_DATA,
- TFRC_RSTATE_TERM = 127,
+/* CCID3 feedback types */
+enum ccid3_fback_type {
+ CCID3_FBACK_NONE = 0,
+ CCID3_FBACK_INITIAL,
+ CCID3_FBACK_PERIODIC,
+ CCID3_FBACK_PARAM_CHANGE
};
-#ifdef CCID3_DEBUG
+#ifdef CONFIG_IP_DCCP_CCID3_DEBUG
static const char *ccid3_rx_state_name(enum ccid3_hc_rx_states state)
{
- static char *ccid3_rx_state_names[] = {
+ static const char *const ccid3_rx_state_names[] = {
[TFRC_RSTATE_NO_DATA] = "NO_DATA",
[TFRC_RSTATE_DATA] = "DATA",
- [TFRC_RSTATE_TERM] = "TERM",
};
return ccid3_rx_state_names[state];
}
#endif
-static inline void ccid3_hc_rx_set_state(struct sock *sk,
- enum ccid3_hc_rx_states state)
+static void ccid3_hc_rx_set_state(struct sock *sk,
+ enum ccid3_hc_rx_states state)
{
- struct ccid3_hc_rx_sock *hcrx = ccid3_hc_rx_sk(sk);
- enum ccid3_hc_rx_states oldstate = hcrx->ccid3hcrx_state;
+ struct ccid3_hc_rx_sock *hc = ccid3_hc_rx_sk(sk);
+ enum ccid3_hc_rx_states oldstate = hc->rx_state;
ccid3_pr_debug("%s(%p) %-8.8s -> %s\n",
dccp_role(sk), sk, ccid3_rx_state_name(oldstate),
ccid3_rx_state_name(state));
WARN_ON(state == oldstate);
- hcrx->ccid3hcrx_state = state;
+ hc->rx_state = state;
}
-static void ccid3_hc_rx_send_feedback(struct sock *sk)
+static void ccid3_hc_rx_send_feedback(struct sock *sk,
+ const struct sk_buff *skb,
+ enum ccid3_fback_type fbtype)
{
- struct ccid3_hc_rx_sock *hcrx = ccid3_hc_rx_sk(sk);
+ struct ccid3_hc_rx_sock *hc = ccid3_hc_rx_sk(sk);
struct dccp_sock *dp = dccp_sk(sk);
- struct dccp_rx_hist_entry *packet;
- struct timeval now;
-
- ccid3_pr_debug("%s, sk=%p\n", dccp_role(sk), sk);
+ ktime_t now = ktime_get_real();
+ s64 delta = 0;
- dccp_timestamp(sk, &now);
-
- switch (hcrx->ccid3hcrx_state) {
- case TFRC_RSTATE_NO_DATA:
- hcrx->ccid3hcrx_x_recv = 0;
+ switch (fbtype) {
+ case CCID3_FBACK_INITIAL:
+ hc->rx_x_recv = 0;
+ hc->rx_pinv = ~0U; /* see RFC 4342, 8.5 */
break;
- case TFRC_RSTATE_DATA: {
- const u32 delta = timeval_delta(&now,
- &hcrx->ccid3hcrx_tstamp_last_feedback);
- hcrx->ccid3hcrx_x_recv = usecs_div(hcrx->ccid3hcrx_bytes_recv,
- delta);
- }
+ case CCID3_FBACK_PARAM_CHANGE:
+ /*
+ * When parameters change (new loss or p > p_prev), we do not
+ * have a reliable estimate for R_m of [RFC 3448, 6.2] and so
+ * need to reuse the previous value of X_recv. However, when
+ * X_recv was 0 (due to early loss), this would kill X down to
+ * s/t_mbi (i.e. one packet in 64 seconds).
+ * To avoid such drastic reduction, we approximate X_recv as
+ * the number of bytes since last feedback.
+ * This is a safe fallback, since X is bounded above by X_calc.
+ */
+ if (hc->rx_x_recv > 0)
+ break;
+ /* fall through */
+ case CCID3_FBACK_PERIODIC:
+ delta = ktime_us_delta(now, hc->rx_tstamp_last_feedback);
+ if (delta <= 0)
+ DCCP_BUG("delta (%ld) <= 0", (long)delta);
+ else
+ hc->rx_x_recv = scaled_div32(hc->rx_bytes_recv, delta);
break;
default:
- printk(KERN_CRIT "%s: %s, sk=%p, Illegal state (%d)!\n",
- __FUNCTION__, dccp_role(sk), sk, hcrx->ccid3hcrx_state);
- dump_stack();
return;
}
- packet = dccp_rx_hist_find_data_packet(&hcrx->ccid3hcrx_hist);
- if (unlikely(packet == NULL)) {
- LIMIT_NETDEBUG(KERN_WARNING "%s: %s, sk=%p, no data packet "
- "in history!\n",
- __FUNCTION__, dccp_role(sk), sk);
- return;
- }
+ ccid3_pr_debug("Interval %ldusec, X_recv=%u, 1/p=%u\n", (long)delta,
+ hc->rx_x_recv, hc->rx_pinv);
- hcrx->ccid3hcrx_tstamp_last_feedback = now;
- hcrx->ccid3hcrx_last_counter = packet->dccphrx_ccval;
- hcrx->ccid3hcrx_seqno_last_counter = packet->dccphrx_seqno;
- hcrx->ccid3hcrx_bytes_recv = 0;
+ hc->rx_tstamp_last_feedback = now;
+ hc->rx_last_counter = dccp_hdr(skb)->dccph_ccval;
+ hc->rx_bytes_recv = 0;
- /* Convert to multiples of 10us */
- hcrx->ccid3hcrx_elapsed_time =
- timeval_delta(&now, &packet->dccphrx_tstamp) / 10;
- if (hcrx->ccid3hcrx_p == 0)
- hcrx->ccid3hcrx_pinv = ~0;
- else
- hcrx->ccid3hcrx_pinv = 1000000 / hcrx->ccid3hcrx_p;
dp->dccps_hc_rx_insert_options = 1;
dccp_send_ack(sk);
}
-static void ccid3_hc_rx_insert_options(struct sock *sk, struct sk_buff *skb)
+static int ccid3_hc_rx_insert_options(struct sock *sk, struct sk_buff *skb)
{
- const struct ccid3_hc_rx_sock *hcrx = ccid3_hc_rx_sk(sk);
- u32 x_recv, pinv;
-
- BUG_ON(hcrx == NULL);
+ const struct ccid3_hc_rx_sock *hc = ccid3_hc_rx_sk(sk);
+ __be32 x_recv, pinv;
if (!(sk->sk_state == DCCP_OPEN || sk->sk_state == DCCP_PARTOPEN))
- return;
-
- DCCP_SKB_CB(skb)->dccpd_ccval = hcrx->ccid3hcrx_last_counter;
+ return 0;
if (dccp_packet_without_ack(skb))
- return;
-
- if (hcrx->ccid3hcrx_elapsed_time != 0)
- dccp_insert_option_elapsed_time(sk, skb,
- hcrx->ccid3hcrx_elapsed_time);
- dccp_insert_option_timestamp(sk, skb);
- x_recv = htonl(hcrx->ccid3hcrx_x_recv);
- pinv = htonl(hcrx->ccid3hcrx_pinv);
- dccp_insert_option(sk, skb, TFRC_OPT_LOSS_EVENT_RATE,
- &pinv, sizeof(pinv));
- dccp_insert_option(sk, skb, TFRC_OPT_RECEIVE_RATE,
- &x_recv, sizeof(x_recv));
+ return 0;
+
+ x_recv = htonl(hc->rx_x_recv);
+ pinv = htonl(hc->rx_pinv);
+
+ if (dccp_insert_option(skb, TFRC_OPT_LOSS_EVENT_RATE,
+ &pinv, sizeof(pinv)) ||
+ dccp_insert_option(skb, TFRC_OPT_RECEIVE_RATE,
+ &x_recv, sizeof(x_recv)))
+ return -1;
+
+ return 0;
}
-/* calculate first loss interval
+/**
+ * ccid3_first_li - Implements [RFC 5348, 6.3.1]
*
- * returns estimated loss interval in usecs */
-
-static u32 ccid3_hc_rx_calc_first_li(struct sock *sk)
+ * Determine the length of the first loss interval via inverse lookup.
+ * Assume that X_recv can be computed by the throughput equation
+ * s
+ * X_recv = --------
+ * R * fval
+ * Find some p such that f(p) = fval; return 1/p (scaled).
+ */
+static u32 ccid3_first_li(struct sock *sk)
{
- struct ccid3_hc_rx_sock *hcrx = ccid3_hc_rx_sk(sk);
- struct dccp_rx_hist_entry *entry, *next, *tail = NULL;
- u32 rtt, delta, x_recv, fval, p, tmp2;
- struct timeval tstamp = { 0, };
- int interval = 0;
- int win_count = 0;
- int step = 0;
- u64 tmp1;
-
- list_for_each_entry_safe(entry, next, &hcrx->ccid3hcrx_hist,
- dccphrx_node) {
- if (dccp_rx_hist_entry_data_packet(entry)) {
- tail = entry;
-
- switch (step) {
- case 0:
- tstamp = entry->dccphrx_tstamp;
- win_count = entry->dccphrx_ccval;
- step = 1;
- break;
- case 1:
- interval = win_count - entry->dccphrx_ccval;
- if (interval < 0)
- interval += TFRC_WIN_COUNT_LIMIT;
- if (interval > 4)
- goto found;
- break;
- }
- }
- }
+ struct ccid3_hc_rx_sock *hc = ccid3_hc_rx_sk(sk);
+ u32 x_recv, p, delta;
+ u64 fval;
- if (unlikely(step == 0)) {
- LIMIT_NETDEBUG(KERN_WARNING "%s: %s, sk=%p, packet history "
- "contains no data packets!\n",
- __FUNCTION__, dccp_role(sk), sk);
- return ~0;
+ if (hc->rx_rtt == 0) {
+ DCCP_WARN("No RTT estimate available, using fallback RTT\n");
+ hc->rx_rtt = DCCP_FALLBACK_RTT;
}
- if (unlikely(interval == 0)) {
- LIMIT_NETDEBUG(KERN_WARNING "%s: %s, sk=%p, Could not find a "
- "win_count interval > 0. Defaulting to 1\n",
- __FUNCTION__, dccp_role(sk), sk);
- interval = 1;
+ delta = ktime_to_us(net_timedelta(hc->rx_tstamp_last_feedback));
+ x_recv = scaled_div32(hc->rx_bytes_recv, delta);
+ if (x_recv == 0) { /* would also trigger divide-by-zero */
+ DCCP_WARN("X_recv==0\n");
+ if (hc->rx_x_recv == 0) {
+ DCCP_BUG("stored value of X_recv is zero");
+ return ~0U;
+ }
+ x_recv = hc->rx_x_recv;
}
-found:
- rtt = timeval_delta(&tstamp, &tail->dccphrx_tstamp) * 4 / interval;
- ccid3_pr_debug("%s, sk=%p, approximated RTT to %uus\n",
- dccp_role(sk), sk, rtt);
- if (rtt == 0)
- rtt = 1;
-
- dccp_timestamp(sk, &tstamp);
- delta = timeval_delta(&tstamp, &hcrx->ccid3hcrx_tstamp_last_feedback);
- x_recv = usecs_div(hcrx->ccid3hcrx_bytes_recv, delta);
-
- tmp1 = (u64)x_recv * (u64)rtt;
- do_div(tmp1,10000000);
- tmp2 = (u32)tmp1;
- fval = (hcrx->ccid3hcrx_s * 100000) / tmp2;
- /* do not alter order above or you will get overflow on 32 bit */
- p = tfrc_calc_x_reverse_lookup(fval);
- ccid3_pr_debug("%s, sk=%p, receive rate=%u bytes/s, implied "
- "loss rate=%u\n", dccp_role(sk), sk, x_recv, p);
- if (p == 0)
- return ~0;
- else
- return 1000000 / p;
-}
-
-static void ccid3_hc_rx_update_li(struct sock *sk, u64 seq_loss, u8 win_loss)
-{
- struct ccid3_hc_rx_sock *hcrx = ccid3_hc_rx_sk(sk);
-
- if (seq_loss != DCCP_MAX_SEQNO + 1 &&
- list_empty(&hcrx->ccid3hcrx_li_hist)) {
- struct dccp_li_hist_entry *li_tail;
-
- li_tail = dccp_li_hist_interval_new(ccid3_li_hist,
- &hcrx->ccid3hcrx_li_hist,
- seq_loss, win_loss);
- if (li_tail == NULL)
- return;
- li_tail->dccplih_interval = ccid3_hc_rx_calc_first_li(sk);
- } else
- LIMIT_NETDEBUG(KERN_WARNING "%s: FIXME: find end of "
- "interval\n", __FUNCTION__);
-}
+ fval = scaled_div(hc->rx_s, hc->rx_rtt);
+ fval = scaled_div32(fval, x_recv);
+ p = tfrc_calc_x_reverse_lookup(fval);
-static void ccid3_hc_rx_detect_loss(struct sock *sk)
-{
- struct ccid3_hc_rx_sock *hcrx = ccid3_hc_rx_sk(sk);
- u8 win_loss;
- const u64 seq_loss = dccp_rx_hist_detect_loss(&hcrx->ccid3hcrx_hist,
- &hcrx->ccid3hcrx_li_hist,
- &win_loss);
+ ccid3_pr_debug("%s(%p), receive rate=%u bytes/s, implied "
+ "loss rate=%u\n", dccp_role(sk), sk, x_recv, p);
- ccid3_hc_rx_update_li(sk, seq_loss, win_loss);
+ return p == 0 ? ~0U : scaled_div(1, p);
}
static void ccid3_hc_rx_packet_recv(struct sock *sk, struct sk_buff *skb)
{
- struct ccid3_hc_rx_sock *hcrx = ccid3_hc_rx_sk(sk);
- const struct dccp_options_received *opt_recv;
- struct dccp_rx_hist_entry *packet;
- struct timeval now;
- u8 win_count;
- u32 p_prev, r_sample, t_elapsed;
- int ins;
-
- BUG_ON(hcrx == NULL ||
- !(hcrx->ccid3hcrx_state == TFRC_RSTATE_NO_DATA ||
- hcrx->ccid3hcrx_state == TFRC_RSTATE_DATA));
-
- opt_recv = &dccp_sk(sk)->dccps_options_received;
-
- switch (DCCP_SKB_CB(skb)->dccpd_type) {
- case DCCP_PKT_ACK:
- if (hcrx->ccid3hcrx_state == TFRC_RSTATE_NO_DATA)
- return;
- case DCCP_PKT_DATAACK:
- if (opt_recv->dccpor_timestamp_echo == 0)
- break;
- p_prev = hcrx->ccid3hcrx_rtt;
- dccp_timestamp(sk, &now);
- timeval_sub_usecs(&now, opt_recv->dccpor_timestamp_echo * 10);
- r_sample = timeval_usecs(&now);
- t_elapsed = opt_recv->dccpor_elapsed_time * 10;
-
- if (unlikely(r_sample <= t_elapsed))
- LIMIT_NETDEBUG(KERN_WARNING "%s: r_sample=%uus, "
- "t_elapsed=%uus\n",
- __FUNCTION__, r_sample, t_elapsed);
- else
- r_sample -= t_elapsed;
+ struct ccid3_hc_rx_sock *hc = ccid3_hc_rx_sk(sk);
+ enum ccid3_fback_type do_feedback = CCID3_FBACK_NONE;
+ const u64 ndp = dccp_sk(sk)->dccps_options_received.dccpor_ndp;
+ const bool is_data_packet = dccp_data_packet(skb);
+
+ if (unlikely(hc->rx_state == TFRC_RSTATE_NO_DATA)) {
+ if (is_data_packet) {
+ const u32 payload = skb->len - dccp_hdr(skb)->dccph_doff * 4;
+ do_feedback = CCID3_FBACK_INITIAL;
+ ccid3_hc_rx_set_state(sk, TFRC_RSTATE_DATA);
+ hc->rx_s = payload;
+ /*
+ * Not necessary to update rx_bytes_recv here,
+ * since X_recv = 0 for the first feedback packet (cf.
+ * RFC 3448, 6.3) -- gerrit
+ */
+ }
+ goto update_records;
+ }
- if (hcrx->ccid3hcrx_state == TFRC_RSTATE_NO_DATA)
- hcrx->ccid3hcrx_rtt = r_sample;
- else
- hcrx->ccid3hcrx_rtt = (hcrx->ccid3hcrx_rtt * 9) / 10 +
- r_sample / 10;
+ if (tfrc_rx_hist_duplicate(&hc->rx_hist, skb))
+ return; /* done receiving */
- if (p_prev != hcrx->ccid3hcrx_rtt)
- ccid3_pr_debug("%s, New RTT=%luus, elapsed time=%u\n",
- dccp_role(sk), hcrx->ccid3hcrx_rtt,
- opt_recv->dccpor_elapsed_time);
- break;
- case DCCP_PKT_DATA:
- break;
- default: /* We're not interested in other packet types, move along */
- return;
+ if (is_data_packet) {
+ const u32 payload = skb->len - dccp_hdr(skb)->dccph_doff * 4;
+ /*
+ * Update moving-average of s and the sum of received payload bytes
+ */
+ hc->rx_s = tfrc_ewma(hc->rx_s, payload, 9);
+ hc->rx_bytes_recv += payload;
}
- packet = dccp_rx_hist_entry_new(ccid3_rx_hist, sk, opt_recv->dccpor_ndp,
- skb, SLAB_ATOMIC);
- if (unlikely(packet == NULL)) {
- LIMIT_NETDEBUG(KERN_WARNING "%s: %s, sk=%p, Not enough mem to "
- "add rx packet to history, consider it lost!\n",
- __FUNCTION__, dccp_role(sk), sk);
- return;
+ /*
+ * Perform loss detection and handle pending losses
+ */
+ if (tfrc_rx_handle_loss(&hc->rx_hist, &hc->rx_li_hist,
+ skb, ndp, ccid3_first_li, sk)) {
+ do_feedback = CCID3_FBACK_PARAM_CHANGE;
+ goto done_receiving;
}
- win_count = packet->dccphrx_ccval;
+ if (tfrc_rx_hist_loss_pending(&hc->rx_hist))
+ return; /* done receiving */
- ins = dccp_rx_hist_add_packet(ccid3_rx_hist, &hcrx->ccid3hcrx_hist,
- &hcrx->ccid3hcrx_li_hist, packet);
-
- if (DCCP_SKB_CB(skb)->dccpd_type == DCCP_PKT_ACK)
- return;
+ /*
+ * Handle data packets: RTT sampling and monitoring p
+ */
+ if (unlikely(!is_data_packet))
+ goto update_records;
- switch (hcrx->ccid3hcrx_state) {
- case TFRC_RSTATE_NO_DATA:
- ccid3_pr_debug("%s, sk=%p(%s), skb=%p, sending initial "
- "feedback\n",
- dccp_role(sk), sk,
- dccp_state_name(sk->sk_state), skb);
- ccid3_hc_rx_send_feedback(sk);
- ccid3_hc_rx_set_state(sk, TFRC_RSTATE_DATA);
- return;
- case TFRC_RSTATE_DATA:
- hcrx->ccid3hcrx_bytes_recv += skb->len -
- dccp_hdr(skb)->dccph_doff * 4;
- if (ins != 0)
- break;
+ if (!tfrc_lh_is_initialised(&hc->rx_li_hist)) {
+ const u32 sample = tfrc_rx_hist_sample_rtt(&hc->rx_hist, skb);
+ /*
+ * Empty loss history: no loss so far, hence p stays 0.
+ * Sample RTT values, since an RTT estimate is required for the
+ * computation of p when the first loss occurs; RFC 3448, 6.3.1.
+ */
+ if (sample != 0)
+ hc->rx_rtt = tfrc_ewma(hc->rx_rtt, sample, 9);
- dccp_timestamp(sk, &now);
- if (timeval_delta(&now, &hcrx->ccid3hcrx_tstamp_last_ack) >=
- hcrx->ccid3hcrx_rtt) {
- hcrx->ccid3hcrx_tstamp_last_ack = now;
- ccid3_hc_rx_send_feedback(sk);
- }
- return;
- default:
- printk(KERN_CRIT "%s: %s, sk=%p, Illegal state (%d)!\n",
- __FUNCTION__, dccp_role(sk), sk, hcrx->ccid3hcrx_state);
- dump_stack();
- return;
+ } else if (tfrc_lh_update_i_mean(&hc->rx_li_hist, skb)) {
+ /*
+ * Step (3) of [RFC 3448, 6.1]: Recompute I_mean and, if I_mean
+ * has decreased (resp. p has increased), send feedback now.
+ */
+ do_feedback = CCID3_FBACK_PARAM_CHANGE;
}
- /* Dealing with packet loss */
- ccid3_pr_debug("%s, sk=%p(%s), data loss! Reacting...\n",
- dccp_role(sk), sk, dccp_state_name(sk->sk_state));
+ /*
+ * Check if the periodic once-per-RTT feedback is due; RFC 4342, 10.3
+ */
+ if (SUB16(dccp_hdr(skb)->dccph_ccval, hc->rx_last_counter) > 3)
+ do_feedback = CCID3_FBACK_PERIODIC;
- ccid3_hc_rx_detect_loss(sk);
- p_prev = hcrx->ccid3hcrx_p;
-
- /* Calculate loss event rate */
- if (!list_empty(&hcrx->ccid3hcrx_li_hist))
- /* Scaling up by 1000000 as fixed decimal */
- hcrx->ccid3hcrx_p = 1000000 / dccp_li_hist_calc_i_mean(&hcrx->ccid3hcrx_li_hist);
+update_records:
+ tfrc_rx_hist_add_packet(&hc->rx_hist, skb, ndp);
- if (hcrx->ccid3hcrx_p > p_prev) {
- ccid3_hc_rx_send_feedback(sk);
- return;
- }
+done_receiving:
+ if (do_feedback)
+ ccid3_hc_rx_send_feedback(sk, skb, do_feedback);
}
-static int ccid3_hc_rx_init(struct sock *sk)
+static int ccid3_hc_rx_init(struct ccid *ccid, struct sock *sk)
{
- struct dccp_sock *dp = dccp_sk(sk);
- struct ccid3_hc_rx_sock *hcrx;
-
- ccid3_pr_debug("%s, sk=%p\n", dccp_role(sk), sk);
-
- dp->dccps_hc_rx_ccid_private = kmalloc(sizeof(*hcrx), gfp_any());
- if (dp->dccps_hc_rx_ccid_private == NULL)
- return -ENOMEM;
+ struct ccid3_hc_rx_sock *hc = ccid_priv(ccid);
- hcrx = ccid3_hc_rx_sk(sk);
- memset(hcrx, 0, sizeof(*hcrx));
-
- if (dp->dccps_packet_size >= TFRC_MIN_PACKET_SIZE &&
- dp->dccps_packet_size <= TFRC_MAX_PACKET_SIZE)
- hcrx->ccid3hcrx_s = dp->dccps_packet_size;
- else
- hcrx->ccid3hcrx_s = TFRC_STD_PACKET_SIZE;
-
- hcrx->ccid3hcrx_state = TFRC_RSTATE_NO_DATA;
- INIT_LIST_HEAD(&hcrx->ccid3hcrx_hist);
- INIT_LIST_HEAD(&hcrx->ccid3hcrx_li_hist);
- dccp_timestamp(sk, &hcrx->ccid3hcrx_tstamp_last_ack);
- hcrx->ccid3hcrx_tstamp_last_feedback = hcrx->ccid3hcrx_tstamp_last_ack;
- hcrx->ccid3hcrx_rtt = 5000; /* XXX 5ms for now... */
- return 0;
+ hc->rx_state = TFRC_RSTATE_NO_DATA;
+ tfrc_lh_init(&hc->rx_li_hist);
+ return tfrc_rx_hist_alloc(&hc->rx_hist);
}
static void ccid3_hc_rx_exit(struct sock *sk)
{
- struct ccid3_hc_rx_sock *hcrx = ccid3_hc_rx_sk(sk);
- struct dccp_sock *dp = dccp_sk(sk);
-
- BUG_ON(hcrx == NULL);
-
- ccid3_hc_rx_set_state(sk, TFRC_RSTATE_TERM);
-
- /* Empty packet history */
- dccp_rx_hist_purge(ccid3_rx_hist, &hcrx->ccid3hcrx_hist);
+ struct ccid3_hc_rx_sock *hc = ccid3_hc_rx_sk(sk);
- /* Empty loss interval history */
- dccp_li_hist_purge(ccid3_li_hist, &hcrx->ccid3hcrx_li_hist);
-
- kfree(dp->dccps_hc_rx_ccid_private);
- dp->dccps_hc_rx_ccid_private = NULL;
+ tfrc_rx_hist_purge(&hc->rx_hist);
+ tfrc_lh_cleanup(&hc->rx_li_hist);
}
static void ccid3_hc_rx_get_info(struct sock *sk, struct tcp_info *info)
{
- const struct ccid3_hc_rx_sock *hcrx = ccid3_hc_rx_sk(sk);
-
- /* Listen socks doesn't have a private CCID block */
- if (sk->sk_state == DCCP_LISTEN)
- return;
-
- BUG_ON(hcrx == NULL);
-
- info->tcpi_ca_state = hcrx->ccid3hcrx_state;
- info->tcpi_options |= TCPI_OPT_TIMESTAMPS;
- info->tcpi_rcv_rtt = hcrx->ccid3hcrx_rtt;
-}
-
-static void ccid3_hc_tx_get_info(struct sock *sk, struct tcp_info *info)
-{
- const struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk);
-
- /* Listen socks doesn't have a private CCID block */
- if (sk->sk_state == DCCP_LISTEN)
- return;
-
- BUG_ON(hctx == NULL);
-
- info->tcpi_rto = hctx->ccid3hctx_t_rto;
- info->tcpi_rtt = hctx->ccid3hctx_rtt;
+ info->tcpi_ca_state = ccid3_hc_rx_sk(sk)->rx_state;
+ info->tcpi_options |= TCPI_OPT_TIMESTAMPS;
+ info->tcpi_rcv_rtt = ccid3_hc_rx_sk(sk)->rx_rtt;
}
static int ccid3_hc_rx_getsockopt(struct sock *sk, const int optname, int len,
u32 __user *optval, int __user *optlen)
{
- const struct ccid3_hc_rx_sock *hcrx = ccid3_hc_rx_sk(sk);
+ const struct ccid3_hc_rx_sock *hc = ccid3_hc_rx_sk(sk);
+ struct tfrc_rx_info rx_info;
const void *val;
-
- /* Listen socks doesn't have a private CCID block */
- if (sk->sk_state == DCCP_LISTEN)
- return -EINVAL;
switch (optname) {
case DCCP_SOCKOPT_CCID_RX_INFO:
- if (len < sizeof(hcrx->ccid3hcrx_tfrc))
+ if (len < sizeof(rx_info))
return -EINVAL;
- len = sizeof(hcrx->ccid3hcrx_tfrc);
- val = &hcrx->ccid3hcrx_tfrc;
+ rx_info.tfrcrx_x_recv = hc->rx_x_recv;
+ rx_info.tfrcrx_rtt = hc->rx_rtt;
+ rx_info.tfrcrx_p = tfrc_invert_loss_event_rate(hc->rx_pinv);
+ len = sizeof(rx_info);
+ val = &rx_info;
break;
default:
return -ENOPROTOOPT;
@@ -1147,46 +843,17 @@ static int ccid3_hc_rx_getsockopt(struct sock *sk, const int optname, int len,
return 0;
}
-static int ccid3_hc_tx_getsockopt(struct sock *sk, const int optname, int len,
- u32 __user *optval, int __user *optlen)
-{
- const struct ccid3_hc_tx_sock *hctx = ccid3_hc_tx_sk(sk);
- const void *val;
-
- /* Listen socks doesn't have a private CCID block */
- if (sk->sk_state == DCCP_LISTEN)
- return -EINVAL;
-
- switch (optname) {
- case DCCP_SOCKOPT_CCID_TX_INFO:
- if (len < sizeof(hctx->ccid3hctx_tfrc))
- return -EINVAL;
- len = sizeof(hctx->ccid3hctx_tfrc);
- val = &hctx->ccid3hctx_tfrc;
- break;
- default:
- return -ENOPROTOOPT;
- }
-
- if (put_user(len, optlen) || copy_to_user(optval, val, len))
- return -EFAULT;
-
- return 0;
-}
-
-static struct ccid ccid3 = {
- .ccid_id = 3,
- .ccid_name = "ccid3",
- .ccid_owner = THIS_MODULE,
- .ccid_init = ccid3_init,
- .ccid_exit = ccid3_exit,
+struct ccid_operations ccid3_ops = {
+ .ccid_id = DCCPC_CCID3,
+ .ccid_name = "TCP-Friendly Rate Control",
+ .ccid_hc_tx_obj_size = sizeof(struct ccid3_hc_tx_sock),
.ccid_hc_tx_init = ccid3_hc_tx_init,
.ccid_hc_tx_exit = ccid3_hc_tx_exit,
.ccid_hc_tx_send_packet = ccid3_hc_tx_send_packet,
.ccid_hc_tx_packet_sent = ccid3_hc_tx_packet_sent,
.ccid_hc_tx_packet_recv = ccid3_hc_tx_packet_recv,
- .ccid_hc_tx_insert_options = ccid3_hc_tx_insert_options,
.ccid_hc_tx_parse_options = ccid3_hc_tx_parse_options,
+ .ccid_hc_rx_obj_size = sizeof(struct ccid3_hc_rx_sock),
.ccid_hc_rx_init = ccid3_hc_rx_init,
.ccid_hc_rx_exit = ccid3_hc_rx_exit,
.ccid_hc_rx_insert_options = ccid3_hc_rx_insert_options,
@@ -1196,75 +863,8 @@ static struct ccid ccid3 = {
.ccid_hc_rx_getsockopt = ccid3_hc_rx_getsockopt,
.ccid_hc_tx_getsockopt = ccid3_hc_tx_getsockopt,
};
-
-module_param(ccid3_debug, int, 0444);
-MODULE_PARM_DESC(ccid3_debug, "Enable debug messages");
-
-static __init int ccid3_module_init(void)
-{
- int rc = -ENOBUFS;
-
- ccid3_rx_hist = dccp_rx_hist_new("ccid3");
- if (ccid3_rx_hist == NULL)
- goto out;
-
- ccid3_tx_hist = dccp_tx_hist_new("ccid3");
- if (ccid3_tx_hist == NULL)
- goto out_free_rx;
-
- ccid3_li_hist = dccp_li_hist_new("ccid3");
- if (ccid3_li_hist == NULL)
- goto out_free_tx;
-
- rc = ccid_register(&ccid3);
- if (rc != 0)
- goto out_free_loss_interval_history;
-out:
- return rc;
-
-out_free_loss_interval_history:
- dccp_li_hist_delete(ccid3_li_hist);
- ccid3_li_hist = NULL;
-out_free_tx:
- dccp_tx_hist_delete(ccid3_tx_hist);
- ccid3_tx_hist = NULL;
-out_free_rx:
- dccp_rx_hist_delete(ccid3_rx_hist);
- ccid3_rx_hist = NULL;
- goto out;
-}
-module_init(ccid3_module_init);
-static __exit void ccid3_module_exit(void)
-{
-#ifdef CONFIG_IP_DCCP_UNLOAD_HACK
- /*
- * Hack to use while developing, so that we get rid of the control
- * sock, that is what keeps a refcount on dccp.ko -acme
- */
- extern void dccp_ctl_sock_exit(void);
-
- dccp_ctl_sock_exit();
+#ifdef CONFIG_IP_DCCP_CCID3_DEBUG
+module_param(ccid3_debug, bool, 0644);
+MODULE_PARM_DESC(ccid3_debug, "Enable CCID-3 debug messages");
#endif
- ccid_unregister(&ccid3);
-
- if (ccid3_tx_hist != NULL) {
- dccp_tx_hist_delete(ccid3_tx_hist);
- ccid3_tx_hist = NULL;
- }
- if (ccid3_rx_hist != NULL) {
- dccp_rx_hist_delete(ccid3_rx_hist);
- ccid3_rx_hist = NULL;
- }
- if (ccid3_li_hist != NULL) {
- dccp_li_hist_delete(ccid3_li_hist);
- ccid3_li_hist = NULL;
- }
-}
-module_exit(ccid3_module_exit);
-
-MODULE_AUTHOR("Ian McDonald <iam4@cs.waikato.ac.nz>, "
- "Arnaldo Carvalho de Melo <acme@ghostprotocols.net>");
-MODULE_DESCRIPTION("DCCP TFRC CCID3 CCID");
-MODULE_LICENSE("GPL");
-MODULE_ALIAS("net-dccp-ccid-3");
diff --git a/net/dccp/ccids/ccid3.h b/net/dccp/ccids/ccid3.h
index 0bde4583d09..1a9933c2967 100644
--- a/net/dccp/ccids/ccid3.h
+++ b/net/dccp/ccids/ccid3.h
@@ -1,13 +1,12 @@
/*
- * net/dccp/ccids/ccid3.h
- *
- * Copyright (c) 2005 The University of Waikato, Hamilton, New Zealand.
+ * Copyright (c) 2005-7 The University of Waikato, Hamilton, New Zealand.
+ * Copyright (c) 2007 The University of Aberdeen, Scotland, UK
*
* An implementation of the DCCP protocol
*
* This code has been developed by the University of Waikato WAND
* research group. For further information please see http://www.wand.net.nz/
- * or e-mail Ian McDonald - iam4@cs.waikato.ac.nz
+ * or e-mail Ian McDonald - ian.mcdonald@jandi.co.nz
*
* This code also uses code from Lulea University, rereleased as GPL by its
* authors:
@@ -36,28 +35,31 @@
#ifndef _DCCP_CCID3_H_
#define _DCCP_CCID3_H_
-#include <linux/config.h>
+#include <linux/ktime.h>
#include <linux/list.h>
-#include <linux/time.h>
#include <linux/types.h>
#include <linux/tfrc.h>
+#include "lib/tfrc.h"
+#include "../ccid.h"
-#define TFRC_MIN_PACKET_SIZE 16
-#define TFRC_STD_PACKET_SIZE 256
-#define TFRC_MAX_PACKET_SIZE 65535
-
-/* Two seconds as per CCID3 spec */
+/* Two seconds as per RFC 5348, 4.2 */
#define TFRC_INITIAL_TIMEOUT (2 * USEC_PER_SEC)
-#define TFRC_INITIAL_IPI (USEC_PER_SEC / 4)
-
-/* In usecs - half the scheduling granularity as per RFC3448 4.6 */
-#define TFRC_OPSYS_HALF_TIME_GRAN (USEC_PER_SEC / (2 * HZ))
+/* Parameter t_mbi from [RFC 3448, 4.3]: backoff interval in seconds */
+#define TFRC_T_MBI 64
-/* In seconds */
-#define TFRC_MAX_BACK_OFF_TIME 64
-
-#define TFRC_SMALLEST_P 40
+/*
+ * The t_delta parameter (RFC 5348, 8.3): delays of less than %USEC_PER_MSEC are
+ * rounded down to 0, since sk_reset_timer() here uses millisecond granularity.
+ * Hence we can use a constant t_delta = %USEC_PER_MSEC when HZ >= 500. A coarse
+ * resolution of HZ < 500 means that the error is below one timer tick (t_gran)
+ * when using the constant t_delta = t_gran / 2 = %USEC_PER_SEC / (2 * HZ).
+ */
+#if (HZ >= 500)
+# define TFRC_T_DELTA USEC_PER_MSEC
+#else
+# define TFRC_T_DELTA (USEC_PER_SEC / (2 * HZ))
+#endif
enum ccid3_options {
TFRC_OPT_LOSS_EVENT_RATE = 192,
@@ -65,82 +67,94 @@ enum ccid3_options {
TFRC_OPT_RECEIVE_RATE = 194,
};
-struct ccid3_options_received {
- u64 ccid3or_seqno:48,
- ccid3or_loss_intervals_idx:16;
- u16 ccid3or_loss_intervals_len;
- u32 ccid3or_loss_event_rate;
- u32 ccid3or_receive_rate;
+/* TFRC sender states */
+enum ccid3_hc_tx_states {
+ TFRC_SSTATE_NO_SENT = 1,
+ TFRC_SSTATE_NO_FBACK,
+ TFRC_SSTATE_FBACK,
};
-/** struct ccid3_hc_tx_sock - CCID3 sender half connection sock
- *
- * @ccid3hctx_state - Sender state
- * @ccid3hctx_x - Current sending rate
- * @ccid3hctx_x_recv - Receive rate
- * @ccid3hctx_x_calc - Calculated send (?) rate
- * @ccid3hctx_s - Packet size
- * @ccid3hctx_rtt - Estimate of current round trip time in usecs
- * @@ccid3hctx_p - Current loss event rate (0-1) scaled by 1000000
- * @ccid3hctx_last_win_count - Last window counter sent
- * @ccid3hctx_t_last_win_count - Timestamp of earliest packet
- * with last_win_count value sent
- * @ccid3hctx_no_feedback_timer - Handle to no feedback timer
- * @ccid3hctx_idle - FIXME
- * @ccid3hctx_t_ld - Time last doubled during slow start
- * @ccid3hctx_t_nom - Nominal send time of next packet
- * @ccid3hctx_t_ipi - Interpacket (send) interval
- * @ccid3hctx_delta - Send timer delta
- * @ccid3hctx_hist - Packet history
- */
+/**
+ * struct ccid3_hc_tx_sock - CCID3 sender half-connection socket
+ * @tx_x: Current sending rate in 64 * bytes per second
+ * @tx_x_recv: Receive rate in 64 * bytes per second
+ * @tx_x_calc: Calculated rate in bytes per second
+ * @tx_rtt: Estimate of current round trip time in usecs
+ * @tx_p: Current loss event rate (0-1) scaled by 1000000
+ * @tx_s: Packet size in bytes
+ * @tx_t_rto: Nofeedback Timer setting in usecs
+ * @tx_t_ipi: Interpacket (send) interval (RFC 3448, 4.6) in usecs
+ * @tx_state: Sender state, one of %ccid3_hc_tx_states
+ * @tx_last_win_count: Last window counter sent
+ * @tx_t_last_win_count: Timestamp of earliest packet
+ * with last_win_count value sent
+ * @tx_no_feedback_timer: Handle to no feedback timer
+ * @tx_t_ld: Time last doubled during slow start
+ * @tx_t_nom: Nominal send time of next packet
+ * @tx_hist: Packet history
+ */
struct ccid3_hc_tx_sock {
- struct tfrc_tx_info ccid3hctx_tfrc;
-#define ccid3hctx_x ccid3hctx_tfrc.tfrctx_x
-#define ccid3hctx_x_recv ccid3hctx_tfrc.tfrctx_x_recv
-#define ccid3hctx_x_calc ccid3hctx_tfrc.tfrctx_x_calc
-#define ccid3hctx_rtt ccid3hctx_tfrc.tfrctx_rtt
-#define ccid3hctx_p ccid3hctx_tfrc.tfrctx_p
-#define ccid3hctx_t_rto ccid3hctx_tfrc.tfrctx_rto
-#define ccid3hctx_t_ipi ccid3hctx_tfrc.tfrctx_ipi
- u16 ccid3hctx_s;
- u8 ccid3hctx_state;
- u8 ccid3hctx_last_win_count;
- u8 ccid3hctx_idle;
- struct timeval ccid3hctx_t_last_win_count;
- struct timer_list ccid3hctx_no_feedback_timer;
- struct timeval ccid3hctx_t_ld;
- struct timeval ccid3hctx_t_nom;
- u32 ccid3hctx_delta;
- struct list_head ccid3hctx_hist;
- struct ccid3_options_received ccid3hctx_options_received;
-};
-
-struct ccid3_hc_rx_sock {
- struct tfrc_rx_info ccid3hcrx_tfrc;
-#define ccid3hcrx_x_recv ccid3hcrx_tfrc.tfrcrx_x_recv
-#define ccid3hcrx_rtt ccid3hcrx_tfrc.tfrcrx_rtt
-#define ccid3hcrx_p ccid3hcrx_tfrc.tfrcrx_p
- u64 ccid3hcrx_seqno_last_counter:48,
- ccid3hcrx_state:8,
- ccid3hcrx_last_counter:4;
- u32 ccid3hcrx_bytes_recv;
- struct timeval ccid3hcrx_tstamp_last_feedback;
- struct timeval ccid3hcrx_tstamp_last_ack;
- struct list_head ccid3hcrx_hist;
- struct list_head ccid3hcrx_li_hist;
- u16 ccid3hcrx_s;
- u32 ccid3hcrx_pinv;
- u32 ccid3hcrx_elapsed_time;
+ u64 tx_x;
+ u64 tx_x_recv;
+ u32 tx_x_calc;
+ u32 tx_rtt;
+ u32 tx_p;
+ u32 tx_t_rto;
+ u32 tx_t_ipi;
+ u16 tx_s;
+ enum ccid3_hc_tx_states tx_state:8;
+ u8 tx_last_win_count;
+ ktime_t tx_t_last_win_count;
+ struct timer_list tx_no_feedback_timer;
+ ktime_t tx_t_ld;
+ ktime_t tx_t_nom;
+ struct tfrc_tx_hist_entry *tx_hist;
};
static inline struct ccid3_hc_tx_sock *ccid3_hc_tx_sk(const struct sock *sk)
{
- return dccp_sk(sk)->dccps_hc_tx_ccid_private;
+ struct ccid3_hc_tx_sock *hctx = ccid_priv(dccp_sk(sk)->dccps_hc_tx_ccid);
+ BUG_ON(hctx == NULL);
+ return hctx;
}
+/* TFRC receiver states */
+enum ccid3_hc_rx_states {
+ TFRC_RSTATE_NO_DATA = 1,
+ TFRC_RSTATE_DATA,
+};
+
+/**
+ * struct ccid3_hc_rx_sock - CCID3 receiver half-connection socket
+ * @rx_last_counter: Tracks window counter (RFC 4342, 8.1)
+ * @rx_state: Receiver state, one of %ccid3_hc_rx_states
+ * @rx_bytes_recv: Total sum of DCCP payload bytes
+ * @rx_x_recv: Receiver estimate of send rate (RFC 3448, sec. 4.3)
+ * @rx_rtt: Receiver estimate of RTT
+ * @rx_tstamp_last_feedback: Time at which last feedback was sent
+ * @rx_hist: Packet history (loss detection + RTT sampling)
+ * @rx_li_hist: Loss Interval database
+ * @rx_s: Received packet size in bytes
+ * @rx_pinv: Inverse of Loss Event Rate (RFC 4342, sec. 8.5)
+ */
+struct ccid3_hc_rx_sock {
+ u8 rx_last_counter:4;
+ enum ccid3_hc_rx_states rx_state:8;
+ u32 rx_bytes_recv;
+ u32 rx_x_recv;
+ u32 rx_rtt;
+ ktime_t rx_tstamp_last_feedback;
+ struct tfrc_rx_hist rx_hist;
+ struct tfrc_loss_hist rx_li_hist;
+ u16 rx_s;
+#define rx_pinv rx_li_hist.i_mean
+};
+
static inline struct ccid3_hc_rx_sock *ccid3_hc_rx_sk(const struct sock *sk)
{
- return dccp_sk(sk)->dccps_hc_rx_ccid_private;
+ struct ccid3_hc_rx_sock *hcrx = ccid_priv(dccp_sk(sk)->dccps_hc_rx_ccid);
+ BUG_ON(hcrx == NULL);
+ return hcrx;
}
#endif /* _DCCP_CCID3_H_ */
diff --git a/net/dccp/ccids/lib/Makefile b/net/dccp/ccids/lib/Makefile
deleted file mode 100644
index 5f940a6cbac..00000000000
--- a/net/dccp/ccids/lib/Makefile
+++ /dev/null
@@ -1,3 +0,0 @@
-obj-$(CONFIG_IP_DCCP_TFRC_LIB) += dccp_tfrc_lib.o
-
-dccp_tfrc_lib-y := loss_interval.o packet_history.o tfrc_equation.o
diff --git a/net/dccp/ccids/lib/loss_interval.c b/net/dccp/ccids/lib/loss_interval.c
index 4c01a54143a..57f9fd78c4d 100644
--- a/net/dccp/ccids/lib/loss_interval.c
+++ b/net/dccp/ccids/lib/loss_interval.c
@@ -1,8 +1,7 @@
/*
- * net/dccp/ccids/lib/loss_interval.c
- *
- * Copyright (c) 2005 The University of Waikato, Hamilton, New Zealand.
- * Copyright (c) 2005 Ian McDonald <iam4@cs.waikato.ac.nz>
+ * Copyright (c) 2007 The University of Aberdeen, Scotland, UK
+ * Copyright (c) 2005-7 The University of Waikato, Hamilton, New Zealand.
+ * Copyright (c) 2005-7 Ian McDonald <ian.mcdonald@jandi.co.nz>
* Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@conectiva.com.br>
*
* This program is free software; you can redistribute it and/or modify
@@ -10,135 +9,177 @@
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*/
+#include <net/sock.h>
+#include "tfrc.h"
-#include <linux/config.h>
-#include <linux/module.h>
-
-#include "loss_interval.h"
+static struct kmem_cache *tfrc_lh_slab __read_mostly;
+/* Loss Interval weights from [RFC 3448, 5.4], scaled by 10 */
+static const int tfrc_lh_weights[NINTERVAL] = { 10, 10, 10, 10, 8, 6, 4, 2 };
-struct dccp_li_hist *dccp_li_hist_new(const char *name)
+/* implements LIFO semantics on the array */
+static inline u8 LIH_INDEX(const u8 ctr)
{
- struct dccp_li_hist *hist = kmalloc(sizeof(*hist), GFP_ATOMIC);
- static const char dccp_li_hist_mask[] = "li_hist_%s";
- char *slab_name;
-
- if (hist == NULL)
- goto out;
-
- slab_name = kmalloc(strlen(name) + sizeof(dccp_li_hist_mask) - 1,
- GFP_ATOMIC);
- if (slab_name == NULL)
- goto out_free_hist;
-
- sprintf(slab_name, dccp_li_hist_mask, name);
- hist->dccplih_slab = kmem_cache_create(slab_name,
- sizeof(struct dccp_li_hist_entry),
- 0, SLAB_HWCACHE_ALIGN,
- NULL, NULL);
- if (hist->dccplih_slab == NULL)
- goto out_free_slab_name;
-out:
- return hist;
-out_free_slab_name:
- kfree(slab_name);
-out_free_hist:
- kfree(hist);
- hist = NULL;
- goto out;
+ return LIH_SIZE - 1 - (ctr % LIH_SIZE);
}
-EXPORT_SYMBOL_GPL(dccp_li_hist_new);
+/*
+ * Return the most recently added loss interval, or NULL if there is none yet.
+ * `counter' indexes the next entry to be populated, so the newest record sits
+ * one position behind it.
+ */
+static inline struct tfrc_loss_interval *tfrc_lh_peek(struct tfrc_loss_hist *lh)
+{
+	if (lh->counter == 0)
+		return NULL;
+
+	return lh->ring[LIH_INDEX(lh->counter - 1)];
+}
-void dccp_li_hist_delete(struct dccp_li_hist *hist)
+/* given i with 0 <= i <= k, return I_i as per the rfc3448bis notation */
+static inline u32 tfrc_lh_get_interval(struct tfrc_loss_hist *lh, const u8 i)
{
- const char* name = kmem_cache_name(hist->dccplih_slab);
+ BUG_ON(i >= lh->counter);
+ return lh->ring[LIH_INDEX(lh->counter - i - 1)]->li_length;
+}
- kmem_cache_destroy(hist->dccplih_slab);
- kfree(name);
- kfree(hist);
+/*
+ * On-demand allocation and de-allocation of entries
+ */
+/*
+ * Return the record in the slot `counter' currently points at, allocating it
+ * from the slab on first use. Returns NULL if that allocation fails.
+ */
+static struct tfrc_loss_interval *tfrc_lh_demand_next(struct tfrc_loss_hist *lh)
+{
+	const u8 slot = LIH_INDEX(lh->counter);
+
+	if (lh->ring[slot] == NULL)
+		lh->ring[slot] = kmem_cache_alloc(tfrc_lh_slab, GFP_ATOMIC);
+
+	return lh->ring[slot];
 }
-EXPORT_SYMBOL_GPL(dccp_li_hist_delete);
+/**
+ * tfrc_lh_cleanup - Release all records held by a Loss Interval database
+ * @lh: Loss Interval database; nothing is done if it is not initialised
+ *
+ * Frees every allocated ring entry and resets @lh->counter to 0. Without the
+ * reset the counter would be left at LIH_SIZE, i.e. the database would still
+ * read as initialised although all of its ring slots are NULL.
+ */
+void tfrc_lh_cleanup(struct tfrc_loss_hist *lh)
+{
+	int i;
+
+	if (!tfrc_lh_is_initialised(lh))
+		return;
+
+	/* visit every slot; the LIFO order does not matter when freeing */
+	for (i = 0; i < LIH_SIZE; i++)
+		if (lh->ring[i] != NULL) {
+			kmem_cache_free(tfrc_lh_slab, lh->ring[i]);
+			lh->ring[i] = NULL;
+		}
+
+	lh->counter = 0;
+}
-void dccp_li_hist_purge(struct dccp_li_hist *hist, struct list_head *list)
+static void tfrc_lh_calc_i_mean(struct tfrc_loss_hist *lh)
{
- struct dccp_li_hist_entry *entry, *next;
+ u32 i_i, i_tot0 = 0, i_tot1 = 0, w_tot = 0;
+ int i, k = tfrc_lh_length(lh) - 1; /* k is as in rfc3448bis, 5.4 */
+
+ if (k <= 0)
+ return;
+
+ for (i = 0; i <= k; i++) {
+ i_i = tfrc_lh_get_interval(lh, i);
- list_for_each_entry_safe(entry, next, list, dccplih_node) {
- list_del_init(&entry->dccplih_node);
- kmem_cache_free(hist->dccplih_slab, entry);
+ if (i < k) {
+ i_tot0 += i_i * tfrc_lh_weights[i];
+ w_tot += tfrc_lh_weights[i];
+ }
+ if (i > 0)
+ i_tot1 += i_i * tfrc_lh_weights[i-1];
}
-}
-EXPORT_SYMBOL_GPL(dccp_li_hist_purge);
+ lh->i_mean = max(i_tot0, i_tot1) / w_tot;
+}
-/* Weights used to calculate loss event rate */
-/*
- * These are integers as per section 8 of RFC3448. We can then divide by 4 *
- * when we use it.
+/**
+ * tfrc_lh_update_i_mean - Update the `open' loss interval I_0
+ * For recomputing p: returns `true' if p > p_prev <=> 1/p < 1/p_prev
*/
-static const int dccp_li_hist_w[DCCP_LI_HIST_IVAL_F_LENGTH] = {
- 4, 4, 4, 4, 3, 2, 1, 1,
-};
-
-u32 dccp_li_hist_calc_i_mean(struct list_head *list)
+u8 tfrc_lh_update_i_mean(struct tfrc_loss_hist *lh, struct sk_buff *skb)
{
- struct dccp_li_hist_entry *li_entry, *li_next;
- int i = 0;
- u32 i_tot;
- u32 i_tot0 = 0;
- u32 i_tot1 = 0;
- u32 w_tot = 0;
-
- list_for_each_entry_safe(li_entry, li_next, list, dccplih_node) {
- if (i < DCCP_LI_HIST_IVAL_F_LENGTH) {
- i_tot0 += li_entry->dccplih_interval * dccp_li_hist_w[i];
- w_tot += dccp_li_hist_w[i];
- }
+ struct tfrc_loss_interval *cur = tfrc_lh_peek(lh);
+ u32 old_i_mean = lh->i_mean;
+ s64 len;
- if (i != 0)
- i_tot1 += li_entry->dccplih_interval * dccp_li_hist_w[i - 1];
+ if (cur == NULL) /* not initialised */
+ return 0;
- if (++i > DCCP_LI_HIST_IVAL_F_LENGTH)
- break;
- }
+ len = dccp_delta_seqno(cur->li_seqno, DCCP_SKB_CB(skb)->dccpd_seq) + 1;
- if (i != DCCP_LI_HIST_IVAL_F_LENGTH)
+ if (len - (s64)cur->li_length <= 0) /* duplicate or reordered */
return 0;
- i_tot = max(i_tot0, i_tot1);
+ if (SUB16(dccp_hdr(skb)->dccph_ccval, cur->li_ccval) > 4)
+ /*
+ * Implements RFC 4342, 10.2:
+ * If a packet S (skb) exists whose seqno comes `after' the one
+ * starting the current loss interval (cur) and if the modulo-16
+ * distance from C(cur) to C(S) is greater than 4, consider all
+ * subsequent packets as belonging to a new loss interval. This
+ * test is necessary since CCVal may wrap between intervals.
+ */
+ cur->li_is_closed = 1;
+
+ if (tfrc_lh_length(lh) == 1) /* due to RFC 3448, 6.3.1 */
+ return 0;
- /* FIXME: Why do we do this? -Ian McDonald */
- if (i_tot * 4 < w_tot)
- i_tot = w_tot * 4;
+ cur->li_length = len;
+ tfrc_lh_calc_i_mean(lh);
- return i_tot * 4 / w_tot;
+ return lh->i_mean < old_i_mean;
}
-EXPORT_SYMBOL_GPL(dccp_li_hist_calc_i_mean);
+/* Determine if `new_loss' does begin a new loss interval [RFC 4342, 10.2] */
+static inline u8 tfrc_lh_is_new_loss(struct tfrc_loss_interval *cur,
+				     struct tfrc_rx_hist_entry *new_loss)
+{
+	/* the loss must come after the seqno which started the current interval */
+	if (dccp_delta_seqno(cur->li_seqno, new_loss->tfrchrx_seqno) <= 0)
+		return 0;
+
+	/* ... and that interval must be closed, or its CCVal more than 4 behind */
+	return cur->li_is_closed ||
+	       SUB16(new_loss->tfrchrx_ccval, cur->li_ccval) > 4;
+}
-struct dccp_li_hist_entry *dccp_li_hist_interval_new(struct dccp_li_hist *hist,
- struct list_head *list,
- const u64 seq_loss,
- const u8 win_loss)
+/**
+ * tfrc_lh_interval_add - Insert new record into the Loss Interval database
+ * @lh: Loss Interval database
+ * @rh: Receive history containing a fresh loss event
+ * @calc_first_li: Caller-dependent routine to compute length of first interval
+ * @sk: Used by @calc_first_li in caller-specific way (subtyping)
+ *
+ * Updates I_mean and returns 1 if a new interval has in fact been added to @lh.
+ */
+int tfrc_lh_interval_add(struct tfrc_loss_hist *lh, struct tfrc_rx_hist *rh,
+ u32 (*calc_first_li)(struct sock *), struct sock *sk)
{
- struct dccp_li_hist_entry *tail = NULL, *entry;
- int i;
-
- for (i = 0; i <= DCCP_LI_HIST_IVAL_F_LENGTH; ++i) {
- entry = dccp_li_hist_entry_new(hist, SLAB_ATOMIC);
- if (entry == NULL) {
- dccp_li_hist_purge(hist, list);
- return NULL;
- }
- if (tail == NULL)
- tail = entry;
- list_add(&entry->dccplih_node, list);
+ struct tfrc_loss_interval *cur = tfrc_lh_peek(lh), *new;
+
+ if (cur != NULL && !tfrc_lh_is_new_loss(cur, tfrc_rx_hist_loss_prev(rh)))
+ return 0;
+
+ new = tfrc_lh_demand_next(lh);
+ if (unlikely(new == NULL)) {
+ DCCP_CRIT("Cannot allocate/add loss record.");
+ return 0;
+ }
+
+ new->li_seqno = tfrc_rx_hist_loss_prev(rh)->tfrchrx_seqno;
+ new->li_ccval = tfrc_rx_hist_loss_prev(rh)->tfrchrx_ccval;
+ new->li_is_closed = 0;
+
+ if (++lh->counter == 1)
+ lh->i_mean = new->li_length = (*calc_first_li)(sk);
+ else {
+ cur->li_length = dccp_delta_seqno(cur->li_seqno, new->li_seqno);
+ new->li_length = dccp_delta_seqno(new->li_seqno,
+ tfrc_rx_hist_last_rcv(rh)->tfrchrx_seqno) + 1;
+ if (lh->counter > (2*LIH_SIZE))
+ lh->counter -= LIH_SIZE;
+
+ tfrc_lh_calc_i_mean(lh);
}
+ return 1;
+}
- entry->dccplih_seqno = seq_loss;
- entry->dccplih_win_count = win_loss;
- return tail;
+/* Create the slab cache for loss interval records; -ENOBUFS if that fails */
+int __init tfrc_li_init(void)
+{
+	tfrc_lh_slab = kmem_cache_create("tfrc_li_hist",
+					 sizeof(struct tfrc_loss_interval), 0,
+					 SLAB_HWCACHE_ALIGN, NULL);
+	if (tfrc_lh_slab == NULL)
+		return -ENOBUFS;
+
+	return 0;
 }
-EXPORT_SYMBOL_GPL(dccp_li_hist_interval_new);
+/* Destroy the loss interval slab cache, if one was created */
+void tfrc_li_exit(void)
+{
+	if (tfrc_lh_slab == NULL)
+		return;
+
+	kmem_cache_destroy(tfrc_lh_slab);
+	tfrc_lh_slab = NULL;	/* makes a repeated call a no-op */
+}
diff --git a/net/dccp/ccids/lib/loss_interval.h b/net/dccp/ccids/lib/loss_interval.h
index 417d9d82df3..57f631a86cc 100644
--- a/net/dccp/ccids/lib/loss_interval.h
+++ b/net/dccp/ccids/lib/loss_interval.h
@@ -1,10 +1,9 @@
#ifndef _DCCP_LI_HIST_
#define _DCCP_LI_HIST_
/*
- * net/dccp/ccids/lib/loss_interval.h
- *
- * Copyright (c) 2005 The University of Waikato, Hamilton, New Zealand.
- * Copyright (c) 2005 Ian McDonald <iam4@cs.waikato.ac.nz>
+ * Copyright (c) 2007 The University of Aberdeen, Scotland, UK
+ * Copyright (c) 2005-7 The University of Waikato, Hamilton, New Zealand.
+ * Copyright (c) 2005-7 Ian McDonald <ian.mcdonald@jandi.co.nz>
* Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@conectiva.com.br>
*
* This program is free software; you can redistribute it and/or modify it
@@ -12,50 +11,63 @@
* Software Foundation; either version 2 of the License, or (at your option)
* any later version.
*/
-
-#include <linux/config.h>
+#include <linux/ktime.h>
#include <linux/list.h>
#include <linux/slab.h>
-#include <linux/time.h>
-#define DCCP_LI_HIST_IVAL_F_LENGTH 8
+/*
+ * Number of loss intervals (RFC 4342, 8.6.1). The history size is one more than
+ * NINTERVAL, since the `open' interval I_0 is always stored as the first entry.
+ */
+#define NINTERVAL 8
+#define LIH_SIZE (NINTERVAL + 1)
-struct dccp_li_hist {
- kmem_cache_t *dccplih_slab;
+/**
+ * struct tfrc_loss_interval - Loss history record for TFRC-based protocols
+ * @li_seqno: Highest received seqno before the start of loss
+ * @li_ccval: The CCVal belonging to @li_seqno
+ * @li_is_closed: Whether @li_seqno is older than 1 RTT
+ * @li_length: Loss interval sequence length
+ *
+ * @li_seqno, @li_ccval and @li_is_closed are bit-fields of one u64
+ * declaration (48 + 4 + 1 bits).
+ */
+struct tfrc_loss_interval {
+ u64 li_seqno:48,
+ li_ccval:4,
+ li_is_closed:1;
+ u32 li_length;
 };
-extern struct dccp_li_hist *dccp_li_hist_new(const char *name);
-extern void dccp_li_hist_delete(struct dccp_li_hist *hist);
-
-struct dccp_li_hist_entry {
- struct list_head dccplih_node;
- u64 dccplih_seqno:48,
- dccplih_win_count:4;
- u32 dccplih_interval;
+/**
+ * struct tfrc_loss_hist - Loss record database
+ * @ring: Circular queue managed in LIFO manner
+ * @counter: Current count of entries (can be more than %LIH_SIZE)
+ * @i_mean: Current Average Loss Interval [RFC 3448, 5.4]
+ */
+struct tfrc_loss_hist {
+ struct tfrc_loss_interval *ring[LIH_SIZE];
+ u8 counter;
+ u32 i_mean;
 };
-static inline struct dccp_li_hist_entry *
- dccp_li_hist_entry_new(struct dccp_li_hist *hist,
- const gfp_t prio)
+/* Put @lh into the empty state: all ring slots NULL, counter and i_mean zero */
+static inline void tfrc_lh_init(struct tfrc_loss_hist *lh)
+{
+	memset(lh, 0, sizeof(*lh));
+}
+
+static inline u8 tfrc_lh_is_initialised(struct tfrc_loss_hist *lh)
{
- return kmem_cache_alloc(hist->dccplih_slab, prio);
+ return lh->counter > 0;
}
-static inline void dccp_li_hist_entry_delete(struct dccp_li_hist *hist,
- struct dccp_li_hist_entry *entry)
+static inline u8 tfrc_lh_length(struct tfrc_loss_hist *lh)
{
- if (entry != NULL)
- kmem_cache_free(hist->dccplih_slab, entry);
+ return min(lh->counter, (u8)LIH_SIZE);
}
-extern void dccp_li_hist_purge(struct dccp_li_hist *hist,
- struct list_head *list);
+struct tfrc_rx_hist;
-extern u32 dccp_li_hist_calc_i_mean(struct list_head *list);
+int tfrc_lh_interval_add(struct tfrc_loss_hist *, struct tfrc_rx_hist *,
+ u32 (*first_li)(struct sock *), struct sock *);
+u8 tfrc_lh_update_i_mean(struct tfrc_loss_hist *lh, struct sk_buff *);
+void tfrc_lh_cleanup(struct tfrc_loss_hist *lh);
-extern struct dccp_li_hist_entry *
- dccp_li_hist_interval_new(struct dccp_li_hist *hist,
- struct list_head *list,
- const u64 seq_loss,
- const u8 win_loss);
#endif /* _DCCP_LI_HIST_ */
diff --git a/net/dccp/ccids/lib/packet_history.c b/net/dccp/ccids/lib/packet_history.c
index d3f9d205383..08df7a3acb3 100644
--- a/net/dccp/ccids/lib/packet_history.c
+++ b/net/dccp/ccids/lib/packet_history.c
@@ -1,13 +1,12 @@
/*
- * net/dccp/packet_history.h
- *
- * Copyright (c) 2005 The University of Waikato, Hamilton, New Zealand.
+ * Copyright (c) 2007 The University of Aberdeen, Scotland, UK
+ * Copyright (c) 2005-7 The University of Waikato, Hamilton, New Zealand.
*
* An implementation of the DCCP protocol
*
* This code has been developed by the University of Waikato WAND
* research group. For further information please see http://www.wand.net.nz/
- * or e-mail Ian McDonald - iam4@cs.waikato.ac.nz
+ * or e-mail Ian McDonald - ian.mcdonald@jandi.co.nz
*
* This code also uses code from Lulea University, rereleased as GPL by its
* authors:
@@ -34,365 +33,417 @@
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
-#include <linux/config.h>
-#include <linux/module.h>
#include <linux/string.h>
-
+#include <linux/slab.h>
#include "packet_history.h"
+#include "../../dccp.h"
-struct dccp_rx_hist *dccp_rx_hist_new(const char *name)
+/*
+ * Transmitter History Routines
+ */
+static struct kmem_cache *tfrc_tx_hist_slab;
+
+int __init tfrc_tx_packet_history_init(void)
{
- struct dccp_rx_hist *hist = kmalloc(sizeof(*hist), GFP_ATOMIC);
- static const char dccp_rx_hist_mask[] = "rx_hist_%s";
- char *slab_name;
-
- if (hist == NULL)
- goto out;
-
- slab_name = kmalloc(strlen(name) + sizeof(dccp_rx_hist_mask) - 1,
- GFP_ATOMIC);
- if (slab_name == NULL)
- goto out_free_hist;
-
- sprintf(slab_name, dccp_rx_hist_mask, name);
- hist->dccprxh_slab = kmem_cache_create(slab_name,
- sizeof(struct dccp_rx_hist_entry),
- 0, SLAB_HWCACHE_ALIGN,
- NULL, NULL);
- if (hist->dccprxh_slab == NULL)
- goto out_free_slab_name;
-out:
- return hist;
-out_free_slab_name:
- kfree(slab_name);
-out_free_hist:
- kfree(hist);
- hist = NULL;
- goto out;
+ tfrc_tx_hist_slab = kmem_cache_create("tfrc_tx_hist",
+ sizeof(struct tfrc_tx_hist_entry),
+ 0, SLAB_HWCACHE_ALIGN, NULL);
+ return tfrc_tx_hist_slab == NULL ? -ENOBUFS : 0;
}
-EXPORT_SYMBOL_GPL(dccp_rx_hist_new);
+/* Destroy the TX history slab cache, if one was created */
+void tfrc_tx_packet_history_exit(void)
+{
+	if (tfrc_tx_hist_slab == NULL)
+		return;
+
+	kmem_cache_destroy(tfrc_tx_hist_slab);
+	tfrc_tx_hist_slab = NULL;	/* makes a repeated call a no-op */
+}
+
+/*
+ * Push a new record for @seqno, stamped with the current real time, onto the
+ * front of the list at @headp. Returns 0, or -ENOBUFS if allocation failed.
+ */
+int tfrc_tx_hist_add(struct tfrc_tx_hist_entry **headp, u64 seqno)
+{
+	struct tfrc_tx_hist_entry *new;
+
+	new = kmem_cache_alloc(tfrc_tx_hist_slab, gfp_any());
+	if (new == NULL)
+		return -ENOBUFS;
+
+	new->seqno = seqno;
+	new->stamp = ktime_get_real();
+
+	/* link in as the new list head */
+	new->next = *headp;
+	*headp = new;
+
+	return 0;
+}
-void dccp_rx_hist_delete(struct dccp_rx_hist *hist)
+void tfrc_tx_hist_purge(struct tfrc_tx_hist_entry **headp)
{
- const char* name = kmem_cache_name(hist->dccprxh_slab);
+ struct tfrc_tx_hist_entry *head = *headp;
+
+ while (head != NULL) {
+ struct tfrc_tx_hist_entry *next = head->next;
- kmem_cache_destroy(hist->dccprxh_slab);
- kfree(name);
- kfree(hist);
+ kmem_cache_free(tfrc_tx_hist_slab, head);
+ head = next;
+ }
+
+ *headp = NULL;
}
-EXPORT_SYMBOL_GPL(dccp_rx_hist_delete);
+/*
+ * Receiver History Routines
+ */
+static struct kmem_cache *tfrc_rx_hist_slab;
-void dccp_rx_hist_purge(struct dccp_rx_hist *hist, struct list_head *list)
+int __init tfrc_rx_packet_history_init(void)
{
- struct dccp_rx_hist_entry *entry, *next;
+ tfrc_rx_hist_slab = kmem_cache_create("tfrc_rxh_cache",
+ sizeof(struct tfrc_rx_hist_entry),
+ 0, SLAB_HWCACHE_ALIGN, NULL);
+ return tfrc_rx_hist_slab == NULL ? -ENOBUFS : 0;
+}
- list_for_each_entry_safe(entry, next, list, dccphrx_node) {
- list_del_init(&entry->dccphrx_node);
- kmem_cache_free(hist->dccprxh_slab, entry);
+/* Destroy the RX history slab cache, if one was created, and clear the pointer */
+void tfrc_rx_packet_history_exit(void)
+{
+ if (tfrc_rx_hist_slab != NULL) {
+ kmem_cache_destroy(tfrc_rx_hist_slab);
+ /* a cleared pointer makes a repeated call a no-op */
+ tfrc_rx_hist_slab = NULL;
 }
 }
-EXPORT_SYMBOL_GPL(dccp_rx_hist_purge);
+/*
+ * Record in @entry the sequence number, CCVal and packet type of @skb together
+ * with the supplied NDP count, and timestamp the entry with the current time.
+ */
+static inline void tfrc_rx_hist_entry_from_skb(struct tfrc_rx_hist_entry *entry,
+					       const struct sk_buff *skb,
+					       const u64 ndp)
+{
+	const struct dccp_hdr *hdr = dccp_hdr(skb);
+
+	/* fields taken from the packet itself */
+	entry->tfrchrx_seqno = DCCP_SKB_CB(skb)->dccpd_seq;
+	entry->tfrchrx_ccval = hdr->dccph_ccval;
+	entry->tfrchrx_type = hdr->dccph_type;
+
+	/* fields supplied by the caller / the clock */
+	entry->tfrchrx_ndp = ndp;
+	entry->tfrchrx_tstamp = ktime_get_real();
+}
-struct dccp_rx_hist_entry *
- dccp_rx_hist_find_data_packet(const struct list_head *list)
+void tfrc_rx_hist_add_packet(struct tfrc_rx_hist *h,
+ const struct sk_buff *skb,
+ const u64 ndp)
{
- struct dccp_rx_hist_entry *entry, *packet = NULL;
+ struct tfrc_rx_hist_entry *entry = tfrc_rx_hist_last_rcv(h);
- list_for_each_entry(entry, list, dccphrx_node)
- if (entry->dccphrx_type == DCCP_PKT_DATA ||
- entry->dccphrx_type == DCCP_PKT_DATAACK) {
- packet = entry;
- break;
- }
+ tfrc_rx_hist_entry_from_skb(entry, skb, ndp);
+}
- return packet;
+/* has the packet contained in skb been seen before? */
+int tfrc_rx_hist_duplicate(struct tfrc_rx_hist *h, struct sk_buff *skb)
+{
+	const u64 seqno = DCCP_SKB_CB(skb)->dccpd_seq;
+	int slot = 1;
+
+	/* not newer than loss_prev: treated as a duplicate */
+	if (dccp_delta_seqno(tfrc_rx_hist_loss_prev(h)->tfrchrx_seqno, seqno) <= 0)
+		return 1;
+
+	/* otherwise compare against the entries 1..loss_count */
+	while (slot <= h->loss_count) {
+		if (tfrc_rx_hist_entry(h, slot)->tfrchrx_seqno == seqno)
+			return 1;
+		slot++;
+	}
+
+	return 0;
 }
-EXPORT_SYMBOL_GPL(dccp_rx_hist_find_data_packet);
+/* Exchange the ring pointers belonging to history entries @a and @b */
+static void tfrc_rx_hist_swap(struct tfrc_rx_hist *h, const u8 a, const u8 b)
+{
+	const u8 i = tfrc_rx_hist_index(h, a);
+	const u8 j = tfrc_rx_hist_index(h, b);
+	struct tfrc_rx_hist_entry *saved = h->ring[i];
+
+	h->ring[i] = h->ring[j];
+	h->ring[j] = saved;
+}
-int dccp_rx_hist_add_packet(struct dccp_rx_hist *hist,
- struct list_head *rx_list,
- struct list_head *li_list,
- struct dccp_rx_hist_entry *packet)
+/*
+ * Private helper functions for loss detection.
+ *
+ * In the descriptions, `Si' refers to the sequence number of entry number i,
+ * whose NDP count is `Ni' (lower case is used for variables).
+ * Note: All __xxx_loss functions expect that a test against duplicates has been
+ * performed already: the seqno of the skb must not be less than the seqno
+ * of loss_prev; and it must not equal that of any valid history entry.
+ */
+static void __do_track_loss(struct tfrc_rx_hist *h, struct sk_buff *skb, u64 n1)
{
- struct dccp_rx_hist_entry *entry, *next, *iter;
- u8 num_later = 0;
-
- iter = dccp_rx_hist_head(rx_list);
- if (iter == NULL)
- dccp_rx_hist_add_entry(rx_list, packet);
- else {
- const u64 seqno = packet->dccphrx_seqno;
-
- if (after48(seqno, iter->dccphrx_seqno))
- dccp_rx_hist_add_entry(rx_list, packet);
- else {
- if (dccp_rx_hist_entry_data_packet(iter))
- num_later = 1;
-
- list_for_each_entry_continue(iter, rx_list,
- dccphrx_node) {
- if (after48(seqno, iter->dccphrx_seqno)) {
- dccp_rx_hist_add_entry(&iter->dccphrx_node,
- packet);
- goto trim_history;
- }
-
- if (dccp_rx_hist_entry_data_packet(iter))
- num_later++;
-
- if (num_later == TFRC_RECV_NUM_LATE_LOSS) {
- dccp_rx_hist_entry_delete(hist, packet);
- return 1;
- }
- }
+ u64 s0 = tfrc_rx_hist_loss_prev(h)->tfrchrx_seqno,
+ s1 = DCCP_SKB_CB(skb)->dccpd_seq;
- if (num_later < TFRC_RECV_NUM_LATE_LOSS)
- dccp_rx_hist_add_entry(rx_list, packet);
- /*
- * FIXME: else what? should we destroy the packet
- * like above?
- */
- }
+ if (!dccp_loss_free(s0, s1, n1)) { /* gap between S0 and S1 */
+ h->loss_count = 1;
+ tfrc_rx_hist_entry_from_skb(tfrc_rx_hist_entry(h, 1), skb, n1);
}
+}
-trim_history:
- /*
- * Trim history (remove all packets after the NUM_LATE_LOSS + 1
- * data packets)
- */
- num_later = TFRC_RECV_NUM_LATE_LOSS + 1;
-
- if (!list_empty(li_list)) {
- list_for_each_entry_safe(entry, next, rx_list, dccphrx_node) {
- if (num_later == 0) {
- list_del_init(&entry->dccphrx_node);
- dccp_rx_hist_entry_delete(hist, entry);
- } else if (dccp_rx_hist_entry_data_packet(entry))
- --num_later;
- }
- } else {
- int step = 0;
- u8 win_count = 0; /* Not needed, but lets shut up gcc */
- int tmp;
+static void __one_after_loss(struct tfrc_rx_hist *h, struct sk_buff *skb, u32 n2)
+{
+ u64 s0 = tfrc_rx_hist_loss_prev(h)->tfrchrx_seqno,
+ s1 = tfrc_rx_hist_entry(h, 1)->tfrchrx_seqno,
+ s2 = DCCP_SKB_CB(skb)->dccpd_seq;
+
+ if (likely(dccp_delta_seqno(s1, s2) > 0)) { /* S1 < S2 */
+ h->loss_count = 2;
+ tfrc_rx_hist_entry_from_skb(tfrc_rx_hist_entry(h, 2), skb, n2);
+ return;
+ }
+
+ /* S0 < S2 < S1 */
+
+ if (dccp_loss_free(s0, s2, n2)) {
+ u64 n1 = tfrc_rx_hist_entry(h, 1)->tfrchrx_ndp;
+
+ if (dccp_loss_free(s2, s1, n1)) {
+ /* hole is filled: S0, S2, and S1 are consecutive */
+ h->loss_count = 0;
+ h->loss_start = tfrc_rx_hist_index(h, 1);
+ } else
+ /* gap between S2 and S1: just update loss_prev */
+ tfrc_rx_hist_entry_from_skb(tfrc_rx_hist_loss_prev(h), skb, n2);
+
+ } else { /* gap between S0 and S2 */
/*
- * We have no loss interval history so we need at least one
- * rtt:s of data packets to approximate rtt.
+ * Reorder history to insert S2 between S0 and S1
*/
- list_for_each_entry_safe(entry, next, rx_list, dccphrx_node) {
- if (num_later == 0) {
- switch (step) {
- case 0:
- step = 1;
- /* OK, find next data packet */
- num_later = 1;
- break;
- case 1:
- step = 2;
- /* OK, find next data packet */
- num_later = 1;
- win_count = entry->dccphrx_ccval;
- break;
- case 2:
- tmp = win_count - entry->dccphrx_ccval;
- if (tmp < 0)
- tmp += TFRC_WIN_COUNT_LIMIT;
- if (tmp > TFRC_WIN_COUNT_PER_RTT + 1) {
- /*
- * We have found a packet older
- * than one rtt remove the rest
- */
- step = 3;
- } else /* OK, find next data packet */
- num_later = 1;
- break;
- case 3:
- list_del_init(&entry->dccphrx_node);
- dccp_rx_hist_entry_delete(hist, entry);
- break;
- }
- } else if (dccp_rx_hist_entry_data_packet(entry))
- --num_later;
- }
+ tfrc_rx_hist_swap(h, 0, 3);
+ h->loss_start = tfrc_rx_hist_index(h, 3);
+ tfrc_rx_hist_entry_from_skb(tfrc_rx_hist_entry(h, 1), skb, n2);
+ h->loss_count = 2;
}
-
- return 0;
}
-EXPORT_SYMBOL_GPL(dccp_rx_hist_add_packet);
-
-u64 dccp_rx_hist_detect_loss(struct list_head *rx_list,
- struct list_head *li_list, u8 *win_loss)
+/* return 1 if a new loss event has been identified */
+static int __two_after_loss(struct tfrc_rx_hist *h, struct sk_buff *skb, u32 n3)
{
- struct dccp_rx_hist_entry *entry, *next, *packet;
- struct dccp_rx_hist_entry *a_loss = NULL;
- struct dccp_rx_hist_entry *b_loss = NULL;
- u64 seq_loss = DCCP_MAX_SEQNO + 1;
- u8 num_later = TFRC_RECV_NUM_LATE_LOSS;
-
- list_for_each_entry_safe(entry, next, rx_list, dccphrx_node) {
- if (num_later == 0) {
- b_loss = entry;
- break;
- } else if (dccp_rx_hist_entry_data_packet(entry))
- --num_later;
+ u64 s0 = tfrc_rx_hist_loss_prev(h)->tfrchrx_seqno,
+ s1 = tfrc_rx_hist_entry(h, 1)->tfrchrx_seqno,
+ s2 = tfrc_rx_hist_entry(h, 2)->tfrchrx_seqno,
+ s3 = DCCP_SKB_CB(skb)->dccpd_seq;
+
+ if (likely(dccp_delta_seqno(s2, s3) > 0)) { /* S2 < S3 */
+ h->loss_count = 3;
+ tfrc_rx_hist_entry_from_skb(tfrc_rx_hist_entry(h, 3), skb, n3);
+ return 1;
}
- if (b_loss == NULL)
- goto out;
+ /* S3 < S2 */
- num_later = 1;
- list_for_each_entry_safe_continue(entry, next, rx_list, dccphrx_node) {
- if (num_later == 0) {
- a_loss = entry;
- break;
- } else if (dccp_rx_hist_entry_data_packet(entry))
- --num_later;
+ if (dccp_delta_seqno(s1, s3) > 0) { /* S1 < S3 < S2 */
+ /*
+ * Reorder history to insert S3 between S1 and S2
+ */
+ tfrc_rx_hist_swap(h, 2, 3);
+ tfrc_rx_hist_entry_from_skb(tfrc_rx_hist_entry(h, 2), skb, n3);
+ h->loss_count = 3;
+ return 1;
}
- if (a_loss == NULL) {
- if (list_empty(li_list)) {
- /* no loss event have occured yet */
- LIMIT_NETDEBUG("%s: TODO: find a lost data packet by "
- "comparing to initial seqno\n",
- __FUNCTION__);
- goto out;
- } else {
- LIMIT_NETDEBUG("%s: Less than 4 data pkts in history!",
- __FUNCTION__);
- goto out;
- }
- }
+ /* S0 < S3 < S1 */
- /* Locate a lost data packet */
- entry = packet = b_loss;
- list_for_each_entry_safe_continue(entry, next, rx_list, dccphrx_node) {
- u64 delta = dccp_delta_seqno(entry->dccphrx_seqno,
- packet->dccphrx_seqno);
-
- if (delta != 0) {
- if (dccp_rx_hist_entry_data_packet(packet))
- --delta;
- /*
- * FIXME: check this, probably this % usage is because
- * in earlier drafts the ndp count was just 8 bits
- * long, but now it cam be up to 24 bits long.
- */
-#if 0
- if (delta % DCCP_NDP_LIMIT !=
- (packet->dccphrx_ndp -
- entry->dccphrx_ndp) % DCCP_NDP_LIMIT)
-#endif
- if (delta != packet->dccphrx_ndp - entry->dccphrx_ndp) {
- seq_loss = entry->dccphrx_seqno;
- dccp_inc_seqno(&seq_loss);
+ if (dccp_loss_free(s0, s3, n3)) {
+ u64 n1 = tfrc_rx_hist_entry(h, 1)->tfrchrx_ndp;
+
+ if (dccp_loss_free(s3, s1, n1)) {
+ /* hole between S0 and S1 filled by S3 */
+ u64 n2 = tfrc_rx_hist_entry(h, 2)->tfrchrx_ndp;
+
+ if (dccp_loss_free(s1, s2, n2)) {
+ /* entire hole filled by S0, S3, S1, S2 */
+ h->loss_start = tfrc_rx_hist_index(h, 2);
+ h->loss_count = 0;
+ } else {
+ /* gap remains between S1 and S2 */
+ h->loss_start = tfrc_rx_hist_index(h, 1);
+ h->loss_count = 1;
}
- }
- packet = entry;
- if (packet == a_loss)
- break;
+
+ } else /* gap exists between S3 and S1, loss_count stays at 2 */
+ tfrc_rx_hist_entry_from_skb(tfrc_rx_hist_loss_prev(h), skb, n3);
+
+ return 0;
}
-out:
- if (seq_loss != DCCP_MAX_SEQNO + 1)
- *win_loss = a_loss->dccphrx_ccval;
- else
- *win_loss = 0; /* Paranoia */
- return seq_loss;
-}
+ /*
+ * The remaining case: S0 < S3 < S1 < S2; gap between S0 and S3
+ * Reorder history to insert S3 between S0 and S1.
+ */
+ tfrc_rx_hist_swap(h, 0, 3);
+ h->loss_start = tfrc_rx_hist_index(h, 3);
+ tfrc_rx_hist_entry_from_skb(tfrc_rx_hist_entry(h, 1), skb, n3);
+ h->loss_count = 3;
-EXPORT_SYMBOL_GPL(dccp_rx_hist_detect_loss);
+ return 1;
+}
-struct dccp_tx_hist *dccp_tx_hist_new(const char *name)
+/* recycle RX history records to continue loss detection if necessary */
+static void __three_after_loss(struct tfrc_rx_hist *h)
{
- struct dccp_tx_hist *hist = kmalloc(sizeof(*hist), GFP_ATOMIC);
- static const char dccp_tx_hist_mask[] = "tx_hist_%s";
- char *slab_name;
-
- if (hist == NULL)
- goto out;
-
- slab_name = kmalloc(strlen(name) + sizeof(dccp_tx_hist_mask) - 1,
- GFP_ATOMIC);
- if (slab_name == NULL)
- goto out_free_hist;
-
- sprintf(slab_name, dccp_tx_hist_mask, name);
- hist->dccptxh_slab = kmem_cache_create(slab_name,
- sizeof(struct dccp_tx_hist_entry),
- 0, SLAB_HWCACHE_ALIGN,
- NULL, NULL);
- if (hist->dccptxh_slab == NULL)
- goto out_free_slab_name;
-out:
- return hist;
-out_free_slab_name:
- kfree(slab_name);
-out_free_hist:
- kfree(hist);
- hist = NULL;
- goto out;
+ /*
+ * At this stage we know already that there is a gap between S0 and S1
+ * (since S0 was the highest sequence number received before detecting
+ * the loss). To recycle the loss record, it is thus only necessary to
+ * check for other possible gaps between S1/S2 and between S2/S3.
+ */
+ u64 s1 = tfrc_rx_hist_entry(h, 1)->tfrchrx_seqno,
+ s2 = tfrc_rx_hist_entry(h, 2)->tfrchrx_seqno,
+ s3 = tfrc_rx_hist_entry(h, 3)->tfrchrx_seqno;
+ u64 n2 = tfrc_rx_hist_entry(h, 2)->tfrchrx_ndp,
+ n3 = tfrc_rx_hist_entry(h, 3)->tfrchrx_ndp;
+
+ if (dccp_loss_free(s1, s2, n2)) {
+
+ if (dccp_loss_free(s2, s3, n3)) {
+ /* no gap between S2 and S3: entire hole is filled */
+ h->loss_start = tfrc_rx_hist_index(h, 3);
+ h->loss_count = 0;
+ } else {
+ /* gap between S2 and S3 */
+ h->loss_start = tfrc_rx_hist_index(h, 2);
+ h->loss_count = 1;
+ }
+
+ } else { /* gap between S1 and S2 */
+ h->loss_start = tfrc_rx_hist_index(h, 1);
+ h->loss_count = 2;
+ }
}
-EXPORT_SYMBOL_GPL(dccp_tx_hist_new);
+/**
+ * tfrc_rx_handle_loss - Loss detection and further processing
+ * @h: The non-empty RX history object
+ * @lh: Loss Intervals database to update
+ * @skb: Currently received packet
+ * @ndp: The NDP count belonging to @skb
+ * @calc_first_li: Caller-dependent computation of first loss interval in @lh
+ * @sk: Used by @calc_first_li (see tfrc_lh_interval_add)
+ *
+ * Dispatches on the pending @h->loss_count. When __two_after_loss() reports a
+ * new loss event, the Loss Interval database is updated and the RX records
+ * are recycled. Returns 1 when caller should send feedback, 0 otherwise.
+ * Reordering during loss detection is handled here as well, so the caller
+ * should not perform any more RX history operations when loss_count is
+ * greater than 0 after calling this function.
+ */
+int tfrc_rx_handle_loss(struct tfrc_rx_hist *h,
+			struct tfrc_loss_hist *lh,
+			struct sk_buff *skb, const u64 ndp,
+			u32 (*calc_first_li)(struct sock *), struct sock *sk)
+{
+	int is_new_loss = 0;
+
+	switch (h->loss_count) {
+	case 0:
+		__do_track_loss(h, skb, ndp);
+		break;
+	case 1:
+		__one_after_loss(h, skb, ndp);
+		break;
+	case 2:
+		if (__two_after_loss(h, skb, ndp)) {
+			/*
+			 * Update Loss Interval database and recycle RX records
+			 */
+			is_new_loss = tfrc_lh_interval_add(lh, h, calc_first_li, sk);
+			__three_after_loss(h);
+		}
+		break;
+	default:
+		DCCP_BUG("invalid loss_count %d", h->loss_count);
+	}
+
+	return is_new_loss;
+}
-void dccp_tx_hist_delete(struct dccp_tx_hist *hist)
+int tfrc_rx_hist_alloc(struct tfrc_rx_hist *h)
{
- const char* name = kmem_cache_name(hist->dccptxh_slab);
+ int i;
- kmem_cache_destroy(hist->dccptxh_slab);
- kfree(name);
- kfree(hist);
-}
+ for (i = 0; i <= TFRC_NDUPACK; i++) {
+ h->ring[i] = kmem_cache_alloc(tfrc_rx_hist_slab, GFP_ATOMIC);
+ if (h->ring[i] == NULL)
+ goto out_free;
+ }
-EXPORT_SYMBOL_GPL(dccp_tx_hist_delete);
+ h->loss_count = h->loss_start = 0;
+ return 0;
+
+out_free:
+ while (i-- != 0) {
+ kmem_cache_free(tfrc_rx_hist_slab, h->ring[i]);
+ h->ring[i] = NULL;
+ }
+ return -ENOBUFS;
+}
-struct dccp_tx_hist_entry *
- dccp_tx_hist_find_entry(const struct list_head *list, const u64 seq)
+void tfrc_rx_hist_purge(struct tfrc_rx_hist *h)
{
- struct dccp_tx_hist_entry *packet = NULL, *entry;
+ int i;
- list_for_each_entry(entry, list, dccphtx_node)
- if (entry->dccphtx_seqno == seq) {
- packet = entry;
- break;
+ for (i = 0; i <= TFRC_NDUPACK; ++i)
+ if (h->ring[i] != NULL) {
+ kmem_cache_free(tfrc_rx_hist_slab, h->ring[i]);
+ h->ring[i] = NULL;
}
-
- return packet;
}
-EXPORT_SYMBOL_GPL(dccp_tx_hist_find_entry);
-
-void dccp_tx_hist_purge_older(struct dccp_tx_hist *hist,
- struct list_head *list,
- struct dccp_tx_hist_entry *packet)
+/**
+ * tfrc_rx_hist_rtt_last_s - reference entry to compute RTT samples against
+ */
+static inline struct tfrc_rx_hist_entry *
+ tfrc_rx_hist_rtt_last_s(const struct tfrc_rx_hist *h)
{
- struct dccp_tx_hist_entry *next;
-
- list_for_each_entry_safe_continue(packet, next, list, dccphtx_node) {
- list_del_init(&packet->dccphtx_node);
- dccp_tx_hist_entry_delete(hist, packet);
- }
+ return h->ring[0];
}
-EXPORT_SYMBOL_GPL(dccp_tx_hist_purge_older);
+/**
+ * tfrc_rx_hist_rtt_prev_s - previously suitable (wrt rtt_last_s) RTT-sampling entry
+ * @h: RX history to read from
+ *
+ * Returns the ring entry selected by @h->rtt_sample_prev.
+ */
+static inline struct tfrc_rx_hist_entry *
+ tfrc_rx_hist_rtt_prev_s(const struct tfrc_rx_hist *h)
+{
+ return h->ring[h->rtt_sample_prev];
+}
-void dccp_tx_hist_purge(struct dccp_tx_hist *hist, struct list_head *list)
+/**
+ * tfrc_rx_hist_sample_rtt - Sample RTT from timestamp / CCVal
+ * Based on ideas presented in RFC 4342, 8.1. Returns 0 if it was not able
+ * to compute a sample with given data - calling function should check this.
+ */
+u32 tfrc_rx_hist_sample_rtt(struct tfrc_rx_hist *h, const struct sk_buff *skb)
{
- struct dccp_tx_hist_entry *entry, *next;
+ u32 sample = 0,
+ delta_v = SUB16(dccp_hdr(skb)->dccph_ccval,
+ tfrc_rx_hist_rtt_last_s(h)->tfrchrx_ccval);
+
+ if (delta_v < 1 || delta_v > 4) { /* unsuitable CCVal delta */
+ if (h->rtt_sample_prev == 2) { /* previous candidate stored */
+ sample = SUB16(tfrc_rx_hist_rtt_prev_s(h)->tfrchrx_ccval,
+ tfrc_rx_hist_rtt_last_s(h)->tfrchrx_ccval);
+ if (sample)
+ sample = 4 / sample *
+ ktime_us_delta(tfrc_rx_hist_rtt_prev_s(h)->tfrchrx_tstamp,
+ tfrc_rx_hist_rtt_last_s(h)->tfrchrx_tstamp);
+ else /*
+ * FIXME: This condition is in principle not
+ * possible but occurs when CCID is used for
+ * two-way data traffic. I have tried to trace
+ * it, but the cause does not seem to be here.
+ */
+ DCCP_BUG("please report to dccp@vger.kernel.org"
+ " => prev = %u, last = %u",
+ tfrc_rx_hist_rtt_prev_s(h)->tfrchrx_ccval,
+ tfrc_rx_hist_rtt_last_s(h)->tfrchrx_ccval);
+ } else if (delta_v < 1) {
+ h->rtt_sample_prev = 1;
+ goto keep_ref_for_next_time;
+ }
- list_for_each_entry_safe(entry, next, list, dccphtx_node) {
- list_del_init(&entry->dccphtx_node);
- dccp_tx_hist_entry_delete(hist, entry);
+ } else if (delta_v == 4) /* optimal match */
+ sample = ktime_to_us(net_timedelta(tfrc_rx_hist_rtt_last_s(h)->tfrchrx_tstamp));
+ else { /* suboptimal match */
+ h->rtt_sample_prev = 2;
+ goto keep_ref_for_next_time;
}
-}
-EXPORT_SYMBOL_GPL(dccp_tx_hist_purge);
+ if (unlikely(sample > DCCP_SANE_RTT_MAX)) {
+ DCCP_WARN("RTT sample %u too large, using max\n", sample);
+ sample = DCCP_SANE_RTT_MAX;
+ }
+
+ h->rtt_sample_prev = 0; /* use current entry as next reference */
+keep_ref_for_next_time:
-MODULE_AUTHOR("Ian McDonald <iam4@cs.waikato.ac.nz>, "
- "Arnaldo Carvalho de Melo <acme@ghostprotocols.net>");
-MODULE_DESCRIPTION("DCCP TFRC library");
-MODULE_LICENSE("GPL");
+ return sample;
+}
diff --git a/net/dccp/ccids/lib/packet_history.h b/net/dccp/ccids/lib/packet_history.h
index 122e96737ff..ee362b0b630 100644
--- a/net/dccp/ccids/lib/packet_history.h
+++ b/net/dccp/ccids/lib/packet_history.h
@@ -1,13 +1,12 @@
/*
- * net/dccp/packet_history.h
+ * Packet RX/TX history data structures and routines for TFRC-based protocols.
*
- * Copyright (c) 2005 The University of Waikato, Hamilton, New Zealand.
- *
- * An implementation of the DCCP protocol
+ * Copyright (c) 2007 The University of Aberdeen, Scotland, UK
+ * Copyright (c) 2005-6 The University of Waikato, Hamilton, New Zealand.
*
* This code has been developed by the University of Waikato WAND
* research group. For further information please see http://www.wand.net.nz/
- * or e-mail Ian McDonald - iam4@cs.waikato.ac.nz
+ * or e-mail Ian McDonald - ian.mcdonald@jandi.co.nz
*
* This code also uses code from Lulea University, rereleased as GPL by its
* authors:
@@ -37,164 +36,120 @@
#ifndef _DCCP_PKT_HIST_
#define _DCCP_PKT_HIST_
-#include <linux/config.h>
#include <linux/list.h>
#include <linux/slab.h>
-#include <linux/time.h>
-
-#include "../../dccp.h"
-
-/* Number of later packets received before one is considered lost */
-#define TFRC_RECV_NUM_LATE_LOSS 3
-
-#define TFRC_WIN_COUNT_PER_RTT 4
-#define TFRC_WIN_COUNT_LIMIT 16
-
-struct dccp_tx_hist_entry {
- struct list_head dccphtx_node;
- u64 dccphtx_seqno:48,
- dccphtx_ccval:4,
- dccphtx_sent:1;
- u32 dccphtx_rtt;
- struct timeval dccphtx_tstamp;
-};
-
-struct dccp_rx_hist_entry {
- struct list_head dccphrx_node;
- u64 dccphrx_seqno:48,
- dccphrx_ccval:4,
- dccphrx_type:4;
- u32 dccphrx_ndp; /* In fact it is from 8 to 24 bits */
- struct timeval dccphrx_tstamp;
-};
-
-struct dccp_tx_hist {
- kmem_cache_t *dccptxh_slab;
-};
+#include "tfrc.h"
-extern struct dccp_tx_hist *dccp_tx_hist_new(const char *name);
-extern void dccp_tx_hist_delete(struct dccp_tx_hist *hist);
-
-struct dccp_rx_hist {
- kmem_cache_t *dccprxh_slab;
+/**
+ * tfrc_tx_hist_entry - Simple singly-linked TX history list
+ * @next: next oldest entry (LIFO order)
+ * @seqno: sequence number of this entry
+ * @stamp: send time of packet with sequence number @seqno
+ */
+struct tfrc_tx_hist_entry {
+ struct tfrc_tx_hist_entry *next;
+ u64 seqno;
+ ktime_t stamp;
};
-extern struct dccp_rx_hist *dccp_rx_hist_new(const char *name);
-extern void dccp_rx_hist_delete(struct dccp_rx_hist *hist);
-extern struct dccp_rx_hist_entry *
- dccp_rx_hist_find_data_packet(const struct list_head *list);
-
-static inline struct dccp_tx_hist_entry *
- dccp_tx_hist_entry_new(struct dccp_tx_hist *hist,
- const gfp_t prio)
+static inline struct tfrc_tx_hist_entry *
+ tfrc_tx_hist_find_entry(struct tfrc_tx_hist_entry *head, u64 seqno)
{
- struct dccp_tx_hist_entry *entry = kmem_cache_alloc(hist->dccptxh_slab,
- prio);
-
- if (entry != NULL)
- entry->dccphtx_sent = 0;
-
- return entry;
+ while (head != NULL && head->seqno != seqno)
+ head = head->next;
+ return head;
}
-static inline void dccp_tx_hist_entry_delete(struct dccp_tx_hist *hist,
- struct dccp_tx_hist_entry *entry)
-{
- if (entry != NULL)
- kmem_cache_free(hist->dccptxh_slab, entry);
-}
+int tfrc_tx_hist_add(struct tfrc_tx_hist_entry **headp, u64 seqno);
+void tfrc_tx_hist_purge(struct tfrc_tx_hist_entry **headp);
-extern struct dccp_tx_hist_entry *
- dccp_tx_hist_find_entry(const struct list_head *list,
- const u64 seq);
+/* Subtraction a-b modulo-16, respects circular wrap-around */
+#define SUB16(a, b) (((a) + 16 - (b)) & 0xF)
-static inline void dccp_tx_hist_add_entry(struct list_head *list,
- struct dccp_tx_hist_entry *entry)
-{
- list_add(&entry->dccphtx_node, list);
-}
+/* Number of packets to wait after a missing packet (RFC 4342, 6.1) */
+#define TFRC_NDUPACK 3
-extern void dccp_tx_hist_purge_older(struct dccp_tx_hist *hist,
- struct list_head *list,
- struct dccp_tx_hist_entry *next);
+/**
+ * tfrc_rx_hist_entry - Store information about a single received packet
+ * @tfrchrx_seqno: DCCP packet sequence number
+ * @tfrchrx_ccval: window counter value of packet (RFC 4342, 8.1)
+ * @tfrchrx_ndp: the NDP count (if any) of the packet
+ * @tfrchrx_tstamp: actual receive time of packet
+ */
+struct tfrc_rx_hist_entry {
+ u64 tfrchrx_seqno:48,
+ tfrchrx_ccval:4,
+ tfrchrx_type:4;
+ u64 tfrchrx_ndp:48;
+ ktime_t tfrchrx_tstamp;
+};
-extern void dccp_tx_hist_purge(struct dccp_tx_hist *hist,
- struct list_head *list);
+/**
+ * tfrc_rx_hist - RX history structure for TFRC-based protocols
+ * @ring: Packet history for RTT sampling and loss detection
+ * @loss_count: Number of entries in circular history
+ * @loss_start: Movable index (for loss detection)
+ * @rtt_sample_prev: Used during RTT sampling, points to candidate entry
+ */
+struct tfrc_rx_hist {
+ struct tfrc_rx_hist_entry *ring[TFRC_NDUPACK + 1];
+ u8 loss_count:2,
+ loss_start:2;
+#define rtt_sample_prev loss_start
+};
-static inline struct dccp_tx_hist_entry *
- dccp_tx_hist_head(struct list_head *list)
+/**
+ * tfrc_rx_hist_index - index to reach n-th entry after loss_start
+ */
+static inline u8 tfrc_rx_hist_index(const struct tfrc_rx_hist *h, const u8 n)
{
- struct dccp_tx_hist_entry *head = NULL;
-
- if (!list_empty(list))
- head = list_entry(list->next, struct dccp_tx_hist_entry,
- dccphtx_node);
- return head;
+ return (h->loss_start + n) & TFRC_NDUPACK;
}
-static inline struct dccp_rx_hist_entry *
- dccp_rx_hist_entry_new(struct dccp_rx_hist *hist,
- const struct sock *sk,
- const u32 ndp,
- const struct sk_buff *skb,
- const gfp_t prio)
+/**
+ * tfrc_rx_hist_last_rcv - entry with highest-received-seqno so far
+ */
+static inline struct tfrc_rx_hist_entry *
+ tfrc_rx_hist_last_rcv(const struct tfrc_rx_hist *h)
{
- struct dccp_rx_hist_entry *entry = kmem_cache_alloc(hist->dccprxh_slab,
- prio);
-
- if (entry != NULL) {
- const struct dccp_hdr *dh = dccp_hdr(skb);
-
- entry->dccphrx_seqno = DCCP_SKB_CB(skb)->dccpd_seq;
- entry->dccphrx_ccval = dh->dccph_ccval;
- entry->dccphrx_type = dh->dccph_type;
- entry->dccphrx_ndp = ndp;
- dccp_timestamp(sk, &entry->dccphrx_tstamp);
- }
-
- return entry;
+ return h->ring[tfrc_rx_hist_index(h, h->loss_count)];
}
-static inline void dccp_rx_hist_entry_delete(struct dccp_rx_hist *hist,
- struct dccp_rx_hist_entry *entry)
+/**
+ * tfrc_rx_hist_entry - return the n-th history entry after loss_start
+ */
+static inline struct tfrc_rx_hist_entry *
+ tfrc_rx_hist_entry(const struct tfrc_rx_hist *h, const u8 n)
{
- if (entry != NULL)
- kmem_cache_free(hist->dccprxh_slab, entry);
+ return h->ring[tfrc_rx_hist_index(h, n)];
}
-extern void dccp_rx_hist_purge(struct dccp_rx_hist *hist,
- struct list_head *list);
-
-static inline void dccp_rx_hist_add_entry(struct list_head *list,
- struct dccp_rx_hist_entry *entry)
+/**
+ * tfrc_rx_hist_loss_prev - entry with highest-received-seqno before loss was detected
+ */
+static inline struct tfrc_rx_hist_entry *
+ tfrc_rx_hist_loss_prev(const struct tfrc_rx_hist *h)
{
- list_add(&entry->dccphrx_node, list);
+ return h->ring[h->loss_start];
}
-static inline struct dccp_rx_hist_entry *
- dccp_rx_hist_head(struct list_head *list)
+/* indicate whether previously a packet was detected missing */
+static inline bool tfrc_rx_hist_loss_pending(const struct tfrc_rx_hist *h)
{
- struct dccp_rx_hist_entry *head = NULL;
-
- if (!list_empty(list))
- head = list_entry(list->next, struct dccp_rx_hist_entry,
- dccphrx_node);
- return head;
+ return h->loss_count > 0;
}
-static inline int
- dccp_rx_hist_entry_data_packet(const struct dccp_rx_hist_entry *entry)
-{
- return entry->dccphrx_type == DCCP_PKT_DATA ||
- entry->dccphrx_type == DCCP_PKT_DATAACK;
-}
+void tfrc_rx_hist_add_packet(struct tfrc_rx_hist *h, const struct sk_buff *skb,
+ const u64 ndp);
-extern int dccp_rx_hist_add_packet(struct dccp_rx_hist *hist,
- struct list_head *rx_list,
- struct list_head *li_list,
- struct dccp_rx_hist_entry *packet);
+int tfrc_rx_hist_duplicate(struct tfrc_rx_hist *h, struct sk_buff *skb);
-extern u64 dccp_rx_hist_detect_loss(struct list_head *rx_list,
- struct list_head *li_list, u8 *win_loss);
+struct tfrc_loss_hist;
+int tfrc_rx_handle_loss(struct tfrc_rx_hist *h, struct tfrc_loss_hist *lh,
+ struct sk_buff *skb, const u64 ndp,
+ u32 (*first_li)(struct sock *sk), struct sock *sk);
+u32 tfrc_rx_hist_sample_rtt(struct tfrc_rx_hist *h, const struct sk_buff *skb);
+int tfrc_rx_hist_alloc(struct tfrc_rx_hist *h);
+void tfrc_rx_hist_purge(struct tfrc_rx_hist *h);
#endif /* _DCCP_PKT_HIST_ */
diff --git a/net/dccp/ccids/lib/tfrc.c b/net/dccp/ccids/lib/tfrc.c
new file mode 100644
index 00000000000..62b5828acde
--- /dev/null
+++ b/net/dccp/ccids/lib/tfrc.c
@@ -0,0 +1,45 @@
+/*
+ * TFRC library initialisation
+ *
+ * Copyright (c) 2007 The University of Aberdeen, Scotland, UK
+ * Copyright (c) 2007 Arnaldo Carvalho de Melo <acme@redhat.com>
+ */
+#include <linux/moduleparam.h>
+#include "tfrc.h"
+
+#ifdef CONFIG_IP_DCCP_TFRC_DEBUG
+bool tfrc_debug;
+module_param(tfrc_debug, bool, 0644);
+MODULE_PARM_DESC(tfrc_debug, "Enable TFRC debug messages");
+#endif
+
+/*
+ * tfrc_lib_init - set up the TFRC library components
+ *
+ * Calls tfrc_li_init(), tfrc_tx_packet_history_init() and
+ * tfrc_rx_packet_history_init() in that order. If one of them returns a
+ * non-zero code, the components initialised before it are shut down again
+ * (in reverse order) and that code is returned. Returns 0 on success.
+ */
+int __init tfrc_lib_init(void)
+{
+ int rc = tfrc_li_init();
+
+ if (rc)
+ goto out;
+
+ rc = tfrc_tx_packet_history_init();
+ if (rc)
+ goto out_free_loss_intervals;
+
+ rc = tfrc_rx_packet_history_init();
+ if (rc)
+ goto out_free_tx_history;
+ return 0;
+
+ /* error unwinding: labels fall through, undoing earlier steps */
+out_free_tx_history:
+ tfrc_tx_packet_history_exit();
+out_free_loss_intervals:
+ tfrc_li_exit();
+out:
+ return rc;
+}
+
+/*
+ * tfrc_lib_exit - shut down the TFRC library components
+ *
+ * Calls the RX history, TX history and loss-interval exit routines, i.e. the
+ * reverse of the order in which tfrc_lib_init() initialises them.
+ */
+void tfrc_lib_exit(void)
+{
+ tfrc_rx_packet_history_exit();
+ tfrc_tx_packet_history_exit();
+ tfrc_li_exit();
+}
diff --git a/net/dccp/ccids/lib/tfrc.h b/net/dccp/ccids/lib/tfrc.h
index 130c4c40cfe..40ee7d62b65 100644
--- a/net/dccp/ccids/lib/tfrc.h
+++ b/net/dccp/ccids/lib/tfrc.h
@@ -1,22 +1,77 @@
#ifndef _TFRC_H_
#define _TFRC_H_
/*
- * net/dccp/ccids/lib/tfrc.h
- *
- * Copyright (c) 2005 The University of Waikato, Hamilton, New Zealand.
- * Copyright (c) 2005 Ian McDonald <iam4@cs.waikato.ac.nz>
- * Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@conectiva.com.br>
- * Copyright (c) 2003 Nils-Erik Mattsson, Joacim Haggmark, Magnus Erixzon
+ * Copyright (c) 2007 The University of Aberdeen, Scotland, UK
+ * Copyright (c) 2005-6 The University of Waikato, Hamilton, New Zealand.
+ * Copyright (c) 2005-6 Ian McDonald <ian.mcdonald@jandi.co.nz>
+ * Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@conectiva.com.br>
+ * Copyright (c) 2003 Nils-Erik Mattsson, Joacim Haggmark, Magnus Erixzon
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*/
-
#include <linux/types.h>
+#include <linux/math64.h>
+#include "../../dccp.h"
+
+/* internal includes that this library exports: */
+#include "loss_interval.h"
+#include "packet_history.h"
+
+#ifdef CONFIG_IP_DCCP_TFRC_DEBUG
+extern bool tfrc_debug;
+#define tfrc_pr_debug(format, a...) DCCP_PR_DEBUG(tfrc_debug, format, ##a)
+#else
+#define tfrc_pr_debug(format, a...)
+#endif
+
+/*
+ * integer-arithmetic divisions of type (a * 1000000)/b
+ *
+ * scaled_div() returns the full 64-bit quotient. A divisor of 0 hits the
+ * BUG_ON() below. The multiplication a * 1000000 is done in u64 and is not
+ * checked for overflow here.
+ */
+static inline u64 scaled_div(u64 a, u64 b)
+{
+ BUG_ON(b == 0);
+ return div64_u64(a * 1000000, b);
+}
+
+/*
+ * Same as scaled_div(), but with a 32-bit result: if the quotient exceeds
+ * UINT_MAX, the overflow is reported via DCCP_CRIT() and UINT_MAX is returned.
+ */
+static inline u32 scaled_div32(u64 a, u64 b)
+{
+ u64 result = scaled_div(a, b);
+
+ if (result > UINT_MAX) {
+ DCCP_CRIT("Overflow: %llu/%llu > UINT_MAX",
+ (unsigned long long)a, (unsigned long long)b);
+ return UINT_MAX;
+ }
+ return result;
+}
+
+/**
+ * tfrc_ewma - Exponentially weighted moving average
+ * @avg: previous average; a value of 0 is treated as "no average yet"
+ * @newval: new sample to be folded into the average
+ * @weight: Weight to be used as damping factor, in units of 1/10
+ *
+ * Returns @newval if @avg is 0, otherwise the integer result of
+ * (@weight * @avg + (10 - @weight) * @newval) / 10.
+ */
+static inline u32 tfrc_ewma(const u32 avg, const u32 newval, const u8 weight)
+{
+ return avg ? (weight * avg + (10 - weight) * newval) / 10 : newval;
+}
+
+u32 tfrc_calc_x(u16 s, u32 R, u32 p);
+u32 tfrc_calc_x_reverse_lookup(u32 fvalue);
+u32 tfrc_invert_loss_event_rate(u32 loss_event_rate);
+
+int tfrc_tx_packet_history_init(void);
+void tfrc_tx_packet_history_exit(void);
+int tfrc_rx_packet_history_init(void);
+void tfrc_rx_packet_history_exit(void);
-extern u32 tfrc_calc_x(u16 s, u32 R, u32 p);
-extern u32 tfrc_calc_x_reverse_lookup(u32 fvalue);
+int tfrc_li_init(void);
+void tfrc_li_exit(void);
+#ifdef CONFIG_IP_DCCP_TFRC_LIB
+int tfrc_lib_init(void);
+void tfrc_lib_exit(void);
+#else
+#define tfrc_lib_init() (0)
+#define tfrc_lib_exit()
+#endif
#endif /* _TFRC_H_ */
diff --git a/net/dccp/ccids/lib/tfrc_equation.c b/net/dccp/ccids/lib/tfrc_equation.c
index d2b5933b451..88ef98285be 100644
--- a/net/dccp/ccids/lib/tfrc_equation.c
+++ b/net/dccp/ccids/lib/tfrc_equation.c
@@ -1,8 +1,6 @@
/*
- * net/dccp/ccids/lib/tfrc_equation.c
- *
* Copyright (c) 2005 The University of Waikato, Hamilton, New Zealand.
- * Copyright (c) 2005 Ian McDonald <iam4@cs.waikato.ac.nz>
+ * Copyright (c) 2005 Ian McDonald <ian.mcdonald@jandi.co.nz>
* Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@conectiva.com.br>
* Copyright (c) 2003 Nils-Erik Mattsson, Joacim Haggmark, Magnus Erixzon
*
@@ -12,19 +10,84 @@
* (at your option) any later version.
*/
-#include <linux/config.h>
#include <linux/module.h>
-
-#include <asm/bug.h>
-#include <asm/div64.h>
-
+#include "../../dccp.h"
#include "tfrc.h"
#define TFRC_CALC_X_ARRSIZE 500
+#define TFRC_CALC_X_SPLIT 50000 /* 0.05 * 1000000, details below */
+#define TFRC_SMALLEST_P (TFRC_CALC_X_SPLIT/TFRC_CALC_X_ARRSIZE)
-#define TFRC_CALC_X_SPLIT 50000
-/* equivalent to 0.05 */
-
+/*
+ TFRC TCP Reno Throughput Equation Lookup Table for f(p)
+
+ The following two-column lookup table implements a part of the TCP throughput
+ equation from [RFC 3448, sec. 3.1]:
+
+ s
+ X_calc = --------------------------------------------------------------
+ R * sqrt(2*b*p/3) + (3 * t_RTO * sqrt(3*b*p/8) * (p + 32*p^3))
+
+ Where:
+ X is the transmit rate in bytes/second
+ s is the packet size in bytes
+ R is the round trip time in seconds
+ p is the loss event rate, between 0 and 1.0, of the number of loss
+ events as a fraction of the number of packets transmitted
+ t_RTO is the TCP retransmission timeout value in seconds
+ b is the number of packets acknowledged by a single TCP ACK
+
+ We can assume that b = 1 and t_RTO is 4 * R. The equation now becomes:
+
+ s
+ X_calc = -------------------------------------------------------
+ R * sqrt(p*2/3) + (12 * R * sqrt(p*3/8) * (p + 32*p^3))
+
+ which we can break down into:
+
+ s
+ X_calc = ---------
+ R * f(p)
+
+ where f(p) is given for 0 < p <= 1 by:
+
+ f(p) = sqrt(2*p/3) + 12 * sqrt(3*p/8) * (p + 32*p^3)
+
+ Since this is kernel code, floating-point arithmetic is avoided in favour of
+ integer arithmetic. This means that nearly all fractional parameters are
+ scaled by 1000000:
+ * the parameters p and R
+ * the return result f(p)
+ The lookup table therefore actually tabulates the following function g(q):
+
+ g(q) = 1000000 * f(q/1000000)
+
+ Hence, when p <= 1, q must be less than or equal to 1000000. To achieve finer
+ granularity for the practically more relevant case of small values of p (up to
+ 5%), the second column is used; the first one ranges up to 100%. This split
+ corresponds to the value of q = TFRC_CALC_X_SPLIT. At the same time this also
+ determines the smallest resolution possible with this lookup table:
+
+ TFRC_SMALLEST_P = TFRC_CALC_X_SPLIT / TFRC_CALC_X_ARRSIZE
+
+ The entire table is generated by:
+ for(i=0; i < TFRC_CALC_X_ARRSIZE; i++) {
+ lookup[i][0] = g((i+1) * 1000000/TFRC_CALC_X_ARRSIZE);
+ lookup[i][1] = g((i+1) * TFRC_CALC_X_SPLIT/TFRC_CALC_X_ARRSIZE);
+ }
+
+ With the given configuration, we have, with M = TFRC_CALC_X_ARRSIZE-1,
+ lookup[0][0] = g(1000000/(M+1)) = 1000000 * f(0.2%)
+ lookup[M][0] = g(1000000) = 1000000 * f(100%)
+ lookup[0][1] = g(TFRC_SMALLEST_P) = 1000000 * f(0.01%)
+ lookup[M][1] = g(TFRC_CALC_X_SPLIT) = 1000000 * f(5%)
+
+ In summary, the two columns represent f(p) for the following ranges:
+ * The first column is for 0.002 <= p <= 1.0
+ * The second column is for 0.0001 <= p <= 0.05
+ Where the columns overlap, the second (finer-grained) is given preference,
+ i.e. the first column is used only for p > 0.05.
+ */
static const u32 tfrc_calc_x_lookup[TFRC_CALC_X_ARRSIZE][2] = {
{ 37172, 8172 },
{ 53499, 11567 },
@@ -528,117 +591,115 @@ static const u32 tfrc_calc_x_lookup[TFRC_CALC_X_ARRSIZE][2] = {
{ 243315981, 271305 }
};
-/* Calculate the send rate as per section 3.1 of RFC3448
-
-Returns send rate in bytes per second
-
-Integer maths and lookups are used as not allowed floating point in kernel
-
-The function for Xcalc as per section 3.1 of RFC3448 is:
-
-X = s
- -------------------------------------------------------------
- R*sqrt(2*b*p/3) + (t_RTO * (3*sqrt(3*b*p/8) * p * (1+32*p^2)))
-
-where
-X is the trasmit rate in bytes/second
-s is the packet size in bytes
-R is the round trip time in seconds
-p is the loss event rate, between 0 and 1.0, of the number of loss events
- as a fraction of the number of packets transmitted
-t_RTO is the TCP retransmission timeout value in seconds
-b is the number of packets acknowledged by a single TCP acknowledgement
-
-we can assume that b = 1 and t_RTO is 4 * R. With this the equation becomes:
-
-X = s
- -----------------------------------------------------------------------
- R * sqrt(2 * p / 3) + (12 * R * (sqrt(3 * p / 8) * p * (1 + 32 * p^2)))
-
-
-which we can break down into:
-
-X = s
- --------
- R * f(p)
-
-where f(p) = sqrt(2 * p / 3) + (12 * sqrt(3 * p / 8) * p * (1 + 32 * p * p))
-
-Function parameters:
-s - bytes
-R - RTT in usecs
-p - loss rate (decimal fraction multiplied by 1,000,000)
-
-Returns Xcalc in bytes per second
-
-DON'T alter this code unless you run test cases against it as the code
-has been manipulated to stop underflow/overlow.
+/*
+ * return smallest index i such that fval <= lookup[i][small]
+ *
+ * Binary search over column 'small' of tfrc_calc_x_lookup, which relies on
+ * that column being sorted in ascending order. If fval is larger than every
+ * entry, the last index (TFRC_CALC_X_ARRSIZE - 1) is returned.
+ */
+static inline u32 tfrc_binsearch(u32 fval, u8 small)
+{
+ u32 try, low = 0, high = TFRC_CALC_X_ARRSIZE - 1;
+
+ while (low < high) {
+ try = (low + high) / 2;
+ if (fval <= tfrc_calc_x_lookup[try][small])
+ high = try; /* try may be the answer, keep it in range */
+ else
+ low = try + 1; /* answer lies strictly above try */
+ }
+ return high;
+}
-*/
+/**
+ * tfrc_calc_x - Calculate the send rate as per section 3.1 of RFC3448
+ * @s: packet size in bytes
+ * @R: RTT scaled by 1000000 (i.e., microseconds)
+ * @p: loss ratio estimate scaled by 1000000
+ *
+ * Returns X_calc in bytes per second (not scaled).
+ */
u32 tfrc_calc_x(u16 s, u32 R, u32 p)
{
- int index;
+ u16 index;
u32 f;
- u64 tmp1, tmp2;
+ u64 result;
+
+ /* check against invalid parameters and divide-by-zero */
+ BUG_ON(p > 1000000); /* p must not exceed 100% */
+ BUG_ON(p == 0); /* f(0) = 0, divide by zero */
+ if (R == 0) { /* possible divide by zero */
+ DCCP_CRIT("WARNING: RTT is 0, returning maximum X_calc.");
+ return ~0U;
+ }
+
+ if (p <= TFRC_CALC_X_SPLIT) { /* 0.0000 < p <= 0.05 */
+ if (p < TFRC_SMALLEST_P) { /* 0.0000 < p < 0.0001 */
+ DCCP_WARN("Value of p (%d) below resolution. "
+ "Substituting %d\n", p, TFRC_SMALLEST_P);
+ index = 0;
+ } else /* 0.0001 <= p <= 0.05 */
+ index = p/TFRC_SMALLEST_P - 1;
- if (p < TFRC_CALC_X_SPLIT)
- index = (p / (TFRC_CALC_X_SPLIT / TFRC_CALC_X_ARRSIZE)) - 1;
- else
- index = (p / (1000000 / TFRC_CALC_X_ARRSIZE)) - 1;
-
- if (index < 0)
- /* p should be 0 unless there is a bug in my code */
- index = 0;
-
- if (R == 0)
- R = 1; /* RTT can't be zero or else divide by zero */
-
- BUG_ON(index >= TFRC_CALC_X_ARRSIZE);
-
- if (p >= TFRC_CALC_X_SPLIT)
- f = tfrc_calc_x_lookup[index][0];
- else
f = tfrc_calc_x_lookup[index][1];
- tmp1 = ((u64)s * 100000000);
- tmp2 = ((u64)R * (u64)f);
- do_div(tmp2, 10000);
- do_div(tmp1, tmp2);
- /* Don't alter above math unless you test due to overflow on 32 bit */
+ } else { /* 0.05 < p <= 1.00 */
+ index = p/(1000000/TFRC_CALC_X_ARRSIZE) - 1;
- return (u32)tmp1;
+ f = tfrc_calc_x_lookup[index][0];
+ }
+
+ /*
+ * Compute X = s/(R*f(p)) in bytes per second.
+ * Since f(p) and R are both scaled by 1000000, we need to multiply by
+ * 1000000^2. To avoid overflow, the result is computed in two stages.
+ * This works under almost all reasonable operational conditions, for a
+ * wide range of parameters. Yet, should some strange combination of
+ * parameters result in overflow, the use of scaled_div32 will catch
+ * this and return UINT_MAX - which is a logically adequate consequence.
+ */
+ result = scaled_div(s, R);
+ return scaled_div32(result, f);
}
-EXPORT_SYMBOL_GPL(tfrc_calc_x);
-
-/*
- * args: fvalue - function value to match
- * returns: p closest to that value
+/**
+ * tfrc_calc_x_reverse_lookup - try to find p given f(p)
+ * @fvalue: function value to match, scaled by 1000000
*
- * both fvalue and p are multiplied by 1,000,000 to use ints
+ * Returns closest match for p, also scaled by 1000000
*/
u32 tfrc_calc_x_reverse_lookup(u32 fvalue)
{
- int ctr = 0;
- int small;
+ int index;
- if (fvalue < tfrc_calc_x_lookup[0][1])
+ if (fvalue == 0) /* f(p) = 0 whenever p = 0 */
return 0;
- if (fvalue <= tfrc_calc_x_lookup[TFRC_CALC_X_ARRSIZE - 1][1])
- small = 1;
- else if (fvalue > tfrc_calc_x_lookup[TFRC_CALC_X_ARRSIZE - 1][0])
+ /* Error cases. */
+ if (fvalue < tfrc_calc_x_lookup[0][1]) {
+ DCCP_WARN("fvalue %u smaller than resolution\n", fvalue);
+ return TFRC_SMALLEST_P;
+ }
+ if (fvalue > tfrc_calc_x_lookup[TFRC_CALC_X_ARRSIZE - 1][0]) {
+ DCCP_WARN("fvalue %u exceeds bounds!\n", fvalue);
return 1000000;
- else
- small = 0;
+ }
- while (fvalue > tfrc_calc_x_lookup[ctr][small])
- ctr++;
+ if (fvalue <= tfrc_calc_x_lookup[TFRC_CALC_X_ARRSIZE - 1][1]) {
+ index = tfrc_binsearch(fvalue, 1);
+ return (index + 1) * TFRC_CALC_X_SPLIT / TFRC_CALC_X_ARRSIZE;
+ }
- if (small)
- return TFRC_CALC_X_SPLIT * ctr / TFRC_CALC_X_ARRSIZE;
- else
- return 1000000 * ctr / TFRC_CALC_X_ARRSIZE;
+ /* else ... it must be in the coarse-grained column */
+ index = tfrc_binsearch(fvalue, 0);
+ return (index + 1) * 1000000 / TFRC_CALC_X_ARRSIZE;
}
-EXPORT_SYMBOL_GPL(tfrc_calc_x_reverse_lookup);
+/**
+ * tfrc_invert_loss_event_rate - Compute p so that 10^6 corresponds to 100%
+ * When @loss_event_rate is large, there is a chance that p is truncated to 0.
+ * To avoid re-entering slow-start in that case, we set p = TFRC_SMALLEST_P > 0.
+ */
+u32 tfrc_invert_loss_event_rate(u32 loss_event_rate)
+{
+ if (loss_event_rate == UINT_MAX) /* see RFC 4342, 8.5 */
+ return 0;
+ if (unlikely(loss_event_rate == 0)) /* map 1/0 into 100% */
+ return 1000000;
+ return max_t(u32, scaled_div(1, loss_event_rate), TFRC_SMALLEST_P);
+}
diff --git a/net/dccp/dccp.h b/net/dccp/dccp.h
index 93f26dd6e6c..c67816647cc 100644
--- a/net/dccp/dccp.h
+++ b/net/dccp/dccp.h
@@ -5,73 +5,138 @@
*
* An implementation of the DCCP protocol
* Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@conectiva.com.br>
- * Copyright (c) 2005 Ian McDonald <iam4@cs.waikato.ac.nz>
+ * Copyright (c) 2005-6 Ian McDonald <ian.mcdonald@jandi.co.nz>
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 as
* published by the Free Software Foundation.
*/
-#include <linux/config.h>
#include <linux/dccp.h>
+#include <linux/ktime.h>
#include <net/snmp.h>
#include <net/sock.h>
#include <net/tcp.h>
#include "ackvec.h"
+/*
+ * DCCP - specific warning and debugging macros.
+ */
+#define DCCP_WARN(fmt, a...) LIMIT_NETDEBUG(KERN_WARNING "%s: " fmt, \
+ __func__, ##a)
+#define DCCP_CRIT(fmt, a...) printk(KERN_CRIT fmt " at %s:%d/%s()\n", ##a, \
+ __FILE__, __LINE__, __func__)
+#define DCCP_BUG(a...) do { DCCP_CRIT("BUG: " a); dump_stack(); } while(0)
+#define DCCP_BUG_ON(cond) do { if (unlikely((cond) != 0)) \
+ DCCP_BUG("\"%s\" holds (exception!)", \
+ __stringify(cond)); \
+ } while (0)
+
+#define DCCP_PRINTK(enable, fmt, args...) do { if (enable) \
+ printk(fmt, ##args); \
+ } while(0)
+#define DCCP_PR_DEBUG(enable, fmt, a...) DCCP_PRINTK(enable, KERN_DEBUG \
+ "%s: " fmt, __func__, ##a)
+
#ifdef CONFIG_IP_DCCP_DEBUG
-extern int dccp_debug;
-
-#define dccp_pr_debug(format, a...) \
- do { if (dccp_debug) \
- printk(KERN_DEBUG "%s: " format, __FUNCTION__ , ##a); \
- } while (0)
-#define dccp_pr_debug_cat(format, a...) do { if (dccp_debug) \
- printk(format, ##a); } while (0)
+extern bool dccp_debug;
+#define dccp_pr_debug(format, a...) DCCP_PR_DEBUG(dccp_debug, format, ##a)
+#define dccp_pr_debug_cat(format, a...) DCCP_PRINTK(dccp_debug, format, ##a)
+#define dccp_debug(fmt, a...) dccp_pr_debug_cat(KERN_DEBUG fmt, ##a)
#else
#define dccp_pr_debug(format, a...)
#define dccp_pr_debug_cat(format, a...)
+#define dccp_debug(format, a...)
#endif
extern struct inet_hashinfo dccp_hashinfo;
-extern atomic_t dccp_orphan_count;
-extern int dccp_tw_count;
-extern void dccp_tw_deschedule(struct inet_timewait_sock *tw);
-
-extern void dccp_time_wait(struct sock *sk, int state, int timeo);
+extern struct percpu_counter dccp_orphan_count;
-/* FIXME: Right size this */
-#define DCCP_MAX_OPT_LEN 128
+void dccp_time_wait(struct sock *sk, int state, int timeo);
-#define DCCP_MAX_PACKET_HDR 32
+/*
+ * Set safe upper bounds for header and option length. Since Data Offset is 8
+ * bits (RFC 4340, sec. 5.1), the total header length can never be more than
+ * 4 * 255 = 1020 bytes. The largest possible header length is 28 bytes (X=1):
+ * - DCCP-Response with ACK Subheader and 4 bytes of Service code OR
+ * - DCCP-Reset with ACK Subheader and 4 bytes of Reset Code fields
+ * Hence a safe upper bound for the maximum option length is 1020-28 = 992
+ */
+#define MAX_DCCP_SPECIFIC_HEADER (255 * sizeof(uint32_t))
+#define DCCP_MAX_PACKET_HDR 28
+#define DCCP_MAX_OPT_LEN (MAX_DCCP_SPECIFIC_HEADER - DCCP_MAX_PACKET_HDR)
+#define MAX_DCCP_HEADER (MAX_DCCP_SPECIFIC_HEADER + MAX_HEADER)
-#define MAX_DCCP_HEADER (DCCP_MAX_PACKET_HDR + DCCP_MAX_OPT_LEN + MAX_HEADER)
+/* Upper bound for initial feature-negotiation overhead (padded to 32 bits) */
+#define DCCP_FEATNEG_OVERHEAD (32 * sizeof(uint32_t))
#define DCCP_TIMEWAIT_LEN (60 * HZ) /* how long to wait to destroy TIME-WAIT
* state, about 60 seconds */
-/* draft-ietf-dccp-spec-11.txt initial RTO value */
-#define DCCP_TIMEOUT_INIT ((unsigned)(3 * HZ))
+/* RFC 1122, 4.2.3.1 initial RTO value */
+#define DCCP_TIMEOUT_INIT ((unsigned int)(3 * HZ))
+
+/*
+ * The maximum back-off value for retransmissions. This is needed for
+ * - retransmitting client-Requests (sec. 8.1.1),
+ * - retransmitting Close/CloseReq when closing (sec. 8.3),
+ * - feature-negotiation retransmission (sec. 6.6.3),
+ * - Acks in client-PARTOPEN state (sec. 8.1.5).
+ */
+#define DCCP_RTO_MAX ((unsigned int)(64 * HZ))
+
+/*
+ * RTT sampling: sanity bounds and fallback RTT value from RFC 4340, section 3.4
+ */
+#define DCCP_SANE_RTT_MIN 100
+#define DCCP_FALLBACK_RTT (USEC_PER_SEC / 5)
+#define DCCP_SANE_RTT_MAX (3 * USEC_PER_SEC)
+
+/* sysctl variables for DCCP */
+extern int sysctl_dccp_request_retries;
+extern int sysctl_dccp_retries1;
+extern int sysctl_dccp_retries2;
+extern int sysctl_dccp_tx_qlen;
+extern int sysctl_dccp_sync_ratelimit;
-/* Maximal interval between probes for local resources. */
-#define DCCP_RESOURCE_PROBE_INTERVAL ((unsigned)(HZ / 2U))
+/*
+ * 48-bit sequence number arithmetic (signed and unsigned)
+ */
+#define INT48_MIN 0x800000000000LL /* 2^47 */
+#define UINT48_MAX 0xFFFFFFFFFFFFLL /* 2^48 - 1 */
+#define COMPLEMENT48(x) (0x1000000000000LL - (x)) /* 2^48 - x */
+#define TO_SIGNED48(x) (((x) < INT48_MIN)? (x) : -COMPLEMENT48( (x)))
+#define TO_UNSIGNED48(x) (((x) >= 0)? (x) : COMPLEMENT48(-(x)))
+#define ADD48(a, b) (((a) + (b)) & UINT48_MAX)
+#define SUB48(a, b) ADD48((a), COMPLEMENT48(b))
-#define DCCP_RTO_MAX ((unsigned)(120 * HZ)) /* FIXME: using TCP value */
+static inline void dccp_set_seqno(u64 *seqno, u64 value)
+{
+ *seqno = value & UINT48_MAX;
+}
-extern struct proto dccp_prot;
+static inline void dccp_inc_seqno(u64 *seqno)
+{
+ *seqno = ADD48(*seqno, 1);
+}
+
+/* signed mod-2^48 distance: pos. if seqno1 < seqno2, neg. if seqno1 > seqno2 */
+static inline s64 dccp_delta_seqno(const u64 seqno1, const u64 seqno2)
+{
+ u64 delta = SUB48(seqno2, seqno1);
+
+ return TO_SIGNED48(delta);
+}
/* is seq1 < seq2 ? */
static inline int before48(const u64 seq1, const u64 seq2)
{
- return (s64)((seq1 << 16) - (seq2 << 16)) < 0;
+ return (s64)((seq2 << 16) - (seq1 << 16)) > 0;
}
/* is seq1 > seq2 ? */
-static inline int after48(const u64 seq1, const u64 seq2)
-{
- return (s64)((seq2 << 16) - (seq1 << 16)) < 0;
-}
+#define after48(seq1, seq2) before48(seq2, seq1)
/* is seq2 <= seq1 <= seq3 ? */
static inline int between48(const u64 seq1, const u64 seq2, const u64 seq3)
@@ -84,12 +149,36 @@ static inline u64 max48(const u64 seq1, const u64 seq2)
return after48(seq1, seq2) ? seq1 : seq2;
}
+/**
+ * dccp_loss_count - Approximate the number of lost data packets in a burst loss
+ * @s1: last known sequence number before the loss ('hole')
+ * @s2: first sequence number seen after the 'hole'
+ * @ndp: NDP count on packet with sequence number @s2
+ */
+static inline u64 dccp_loss_count(const u64 s1, const u64 s2, const u64 ndp)
+{
+ s64 delta = dccp_delta_seqno(s1, s2);
+
+ WARN_ON(delta < 0);
+ delta -= ndp + 1;
+
+ return delta > 0 ? delta : 0;
+}
+
+/**
+ * dccp_loss_free - Evaluate condition for data loss from RFC 4340, 7.7.1
+ */
+static inline bool dccp_loss_free(const u64 s1, const u64 s2, const u64 ndp)
+{
+ return dccp_loss_count(s1, s2, ndp) == 0;
+}
+
enum {
DCCP_MIB_NUM = 0,
DCCP_MIB_ACTIVEOPENS, /* ActiveOpens */
DCCP_MIB_ESTABRESETS, /* EstabResets */
DCCP_MIB_CURRESTAB, /* CurrEstab */
- DCCP_MIB_OUTSEGS, /* OutSegs */
+ DCCP_MIB_OUTSEGS, /* OutSegs */
DCCP_MIB_OUTRSTS,
DCCP_MIB_ABORTONTIMEOUT,
DCCP_MIB_TIMEOUTS,
@@ -106,171 +195,140 @@ enum {
#define DCCP_MIB_MAX __DCCP_MIB_MAX
struct dccp_mib {
unsigned long mibs[DCCP_MIB_MAX];
-} __SNMP_MIB_ALIGN__;
+};
DECLARE_SNMP_STAT(struct dccp_mib, dccp_statistics);
#define DCCP_INC_STATS(field) SNMP_INC_STATS(dccp_statistics, field)
#define DCCP_INC_STATS_BH(field) SNMP_INC_STATS_BH(dccp_statistics, field)
-#define DCCP_INC_STATS_USER(field) SNMP_INC_STATS_USER(dccp_statistics, field)
#define DCCP_DEC_STATS(field) SNMP_DEC_STATS(dccp_statistics, field)
-#define DCCP_ADD_STATS_BH(field, val) \
- SNMP_ADD_STATS_BH(dccp_statistics, field, val)
-#define DCCP_ADD_STATS_USER(field, val) \
- SNMP_ADD_STATS_USER(dccp_statistics, field, val)
-extern int dccp_retransmit_skb(struct sock *sk, struct sk_buff *skb);
+/*
+ * Checksumming routines
+ */
+static inline unsigned int dccp_csum_coverage(const struct sk_buff *skb)
+{
+ const struct dccp_hdr* dh = dccp_hdr(skb);
+
+ if (dh->dccph_cscov == 0)
+ return skb->len;
+ return (dh->dccph_doff + dh->dccph_cscov - 1) * sizeof(u32);
+}
-extern int dccp_send_response(struct sock *sk);
-extern void dccp_send_ack(struct sock *sk);
-extern void dccp_send_delayed_ack(struct sock *sk);
-extern void dccp_send_sync(struct sock *sk, const u64 seq,
- const enum dccp_pkt_type pkt_type);
+static inline void dccp_csum_outgoing(struct sk_buff *skb)
+{
+ unsigned int cov = dccp_csum_coverage(skb);
+
+ if (cov >= skb->len)
+ dccp_hdr(skb)->dccph_cscov = 0;
-extern int dccp_write_xmit(struct sock *sk, struct sk_buff *skb, long *timeo);
-extern void dccp_write_space(struct sock *sk);
+ skb->csum = skb_checksum(skb, 0, (cov > skb->len)? skb->len : cov, 0);
+}
-extern void dccp_init_xmit_timers(struct sock *sk);
+void dccp_v4_send_check(struct sock *sk, struct sk_buff *skb);
+
+int dccp_retransmit_skb(struct sock *sk);
+
+void dccp_send_ack(struct sock *sk);
+void dccp_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
+ struct request_sock *rsk);
+
+void dccp_send_sync(struct sock *sk, const u64 seq,
+ const enum dccp_pkt_type pkt_type);
+
+/*
+ * TX Packet Dequeueing Interface
+ */
+void dccp_qpolicy_push(struct sock *sk, struct sk_buff *skb);
+bool dccp_qpolicy_full(struct sock *sk);
+void dccp_qpolicy_drop(struct sock *sk, struct sk_buff *skb);
+struct sk_buff *dccp_qpolicy_top(struct sock *sk);
+struct sk_buff *dccp_qpolicy_pop(struct sock *sk);
+bool dccp_qpolicy_param_ok(struct sock *sk, __be32 param);
+
+/*
+ * TX Packet Output and TX Timers
+ */
+void dccp_write_xmit(struct sock *sk);
+void dccp_write_space(struct sock *sk);
+void dccp_flush_write_queue(struct sock *sk, long *time_budget);
+
+void dccp_init_xmit_timers(struct sock *sk);
+/* Clear the transmit timers of @sk; thin wrapper around the inet_csk helper. */
static inline void dccp_clear_xmit_timers(struct sock *sk)
{
inet_csk_clear_xmit_timers(sk);
}
-extern unsigned int dccp_sync_mss(struct sock *sk, u32 pmtu);
+unsigned int dccp_sync_mss(struct sock *sk, u32 pmtu);
-extern const char *dccp_packet_name(const int type);
-extern const char *dccp_state_name(const int state);
+const char *dccp_packet_name(const int type);
-static inline void dccp_set_state(struct sock *sk, const int state)
-{
- const int oldstate = sk->sk_state;
-
- dccp_pr_debug("%s(%p) %-10.10s -> %s\n",
- dccp_role(sk), sk,
- dccp_state_name(oldstate), dccp_state_name(state));
- WARN_ON(state == oldstate);
-
- switch (state) {
- case DCCP_OPEN:
- if (oldstate != DCCP_OPEN)
- DCCP_INC_STATS(DCCP_MIB_CURRESTAB);
- break;
-
- case DCCP_CLOSED:
- if (oldstate == DCCP_CLOSING || oldstate == DCCP_OPEN)
- DCCP_INC_STATS(DCCP_MIB_ESTABRESETS);
-
- sk->sk_prot->unhash(sk);
- if (inet_csk(sk)->icsk_bind_hash != NULL &&
- !(sk->sk_userlocks & SOCK_BINDPORT_LOCK))
- inet_put_port(&dccp_hashinfo, sk);
- /* fall through */
- default:
- if (oldstate == DCCP_OPEN)
- DCCP_DEC_STATS(DCCP_MIB_CURRESTAB);
- }
-
- /* Change state AFTER socket is unhashed to avoid closed
- * socket sitting in hash tables.
- */
- sk->sk_state = state;
-}
+void dccp_set_state(struct sock *sk, const int state);
+void dccp_done(struct sock *sk);
-static inline void dccp_done(struct sock *sk)
-{
- dccp_set_state(sk, DCCP_CLOSED);
- dccp_clear_xmit_timers(sk);
+int dccp_reqsk_init(struct request_sock *rq, struct dccp_sock const *dp,
+ struct sk_buff const *skb);
- sk->sk_shutdown = SHUTDOWN_MASK;
+int dccp_v4_conn_request(struct sock *sk, struct sk_buff *skb);
- if (!sock_flag(sk, SOCK_DEAD))
- sk->sk_state_change(sk);
- else
- inet_csk_destroy_sock(sk);
-}
+struct sock *dccp_create_openreq_child(struct sock *sk,
+ const struct request_sock *req,
+ const struct sk_buff *skb);
-static inline void dccp_openreq_init(struct request_sock *req,
- struct dccp_sock *dp,
- struct sk_buff *skb)
-{
- /*
- * FIXME: fill in the other req fields from the DCCP options
- * received
- */
- inet_rsk(req)->rmt_port = dccp_hdr(skb)->dccph_sport;
- inet_rsk(req)->acked = 0;
- req->rcv_wnd = 0;
-}
+int dccp_v4_do_rcv(struct sock *sk, struct sk_buff *skb);
-extern int dccp_v4_conn_request(struct sock *sk, struct sk_buff *skb);
-
-extern struct sock *dccp_create_openreq_child(struct sock *sk,
- const struct request_sock *req,
- const struct sk_buff *skb);
-
-extern int dccp_v4_do_rcv(struct sock *sk, struct sk_buff *skb);
-
-extern void dccp_v4_err(struct sk_buff *skb, u32);
-
-extern int dccp_v4_rcv(struct sk_buff *skb);
-
-extern struct sock *dccp_v4_request_recv_sock(struct sock *sk,
- struct sk_buff *skb,
- struct request_sock *req,
- struct dst_entry *dst);
-extern struct sock *dccp_check_req(struct sock *sk, struct sk_buff *skb,
- struct request_sock *req,
- struct request_sock **prev);
-
-extern int dccp_child_process(struct sock *parent, struct sock *child,
- struct sk_buff *skb);
-extern int dccp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
- struct dccp_hdr *dh, unsigned len);
-extern int dccp_rcv_established(struct sock *sk, struct sk_buff *skb,
- const struct dccp_hdr *dh, const unsigned len);
-
-extern int dccp_v4_init_sock(struct sock *sk);
-extern int dccp_v4_destroy_sock(struct sock *sk);
-
-extern void dccp_close(struct sock *sk, long timeout);
-extern struct sk_buff *dccp_make_response(struct sock *sk,
- struct dst_entry *dst,
- struct request_sock *req);
-extern struct sk_buff *dccp_make_reset(struct sock *sk,
- struct dst_entry *dst,
- enum dccp_reset_codes code);
-
-extern int dccp_connect(struct sock *sk);
-extern int dccp_disconnect(struct sock *sk, int flags);
-extern void dccp_unhash(struct sock *sk);
-extern int dccp_getsockopt(struct sock *sk, int level, int optname,
- char __user *optval, int __user *optlen);
-extern int dccp_setsockopt(struct sock *sk, int level, int optname,
- char __user *optval, int optlen);
-extern int dccp_ioctl(struct sock *sk, int cmd, unsigned long arg);
-extern int dccp_sendmsg(struct kiocb *iocb, struct sock *sk,
- struct msghdr *msg, size_t size);
-extern int dccp_recvmsg(struct kiocb *iocb, struct sock *sk,
- struct msghdr *msg, size_t len, int nonblock,
- int flags, int *addr_len);
-extern void dccp_shutdown(struct sock *sk, int how);
-extern int inet_dccp_listen(struct socket *sock, int backlog);
-extern unsigned int dccp_poll(struct file *file, struct socket *sock,
- poll_table *wait);
-extern void dccp_v4_send_check(struct sock *sk, int len,
- struct sk_buff *skb);
-extern int dccp_v4_connect(struct sock *sk, struct sockaddr *uaddr,
- int addr_len);
-
-extern int dccp_v4_checksum(const struct sk_buff *skb,
- const u32 saddr, const u32 daddr);
-
-extern int dccp_v4_send_reset(struct sock *sk,
- enum dccp_reset_codes code);
-extern void dccp_send_close(struct sock *sk, const int active);
-extern int dccp_invalid_packet(struct sk_buff *skb);
+struct sock *dccp_v4_request_recv_sock(struct sock *sk, struct sk_buff *skb,
+ struct request_sock *req,
+ struct dst_entry *dst);
+struct sock *dccp_check_req(struct sock *sk, struct sk_buff *skb,
+ struct request_sock *req,
+ struct request_sock **prev);
+
+int dccp_child_process(struct sock *parent, struct sock *child,
+ struct sk_buff *skb);
+int dccp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
+ struct dccp_hdr *dh, unsigned int len);
+int dccp_rcv_established(struct sock *sk, struct sk_buff *skb,
+ const struct dccp_hdr *dh, const unsigned int len);
+
+int dccp_init_sock(struct sock *sk, const __u8 ctl_sock_initialized);
+void dccp_destroy_sock(struct sock *sk);
+
+void dccp_close(struct sock *sk, long timeout);
+struct sk_buff *dccp_make_response(struct sock *sk, struct dst_entry *dst,
+ struct request_sock *req);
+
+int dccp_connect(struct sock *sk);
+int dccp_disconnect(struct sock *sk, int flags);
+int dccp_getsockopt(struct sock *sk, int level, int optname,
+ char __user *optval, int __user *optlen);
+int dccp_setsockopt(struct sock *sk, int level, int optname,
+ char __user *optval, unsigned int optlen);
+#ifdef CONFIG_COMPAT
+int compat_dccp_getsockopt(struct sock *sk, int level, int optname,
+ char __user *optval, int __user *optlen);
+int compat_dccp_setsockopt(struct sock *sk, int level, int optname,
+ char __user *optval, unsigned int optlen);
+#endif
+int dccp_ioctl(struct sock *sk, int cmd, unsigned long arg);
+int dccp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
+ size_t size);
+int dccp_recvmsg(struct kiocb *iocb, struct sock *sk,
+ struct msghdr *msg, size_t len, int nonblock, int flags,
+ int *addr_len);
+void dccp_shutdown(struct sock *sk, int how);
+int inet_dccp_listen(struct socket *sock, int backlog);
+unsigned int dccp_poll(struct file *file, struct socket *sock,
+ poll_table *wait);
+int dccp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len);
+
+struct sk_buff *dccp_ctl_make_reset(struct sock *sk, struct sk_buff *skb);
+int dccp_send_reset(struct sock *sk, enum dccp_reset_codes code);
+void dccp_send_close(struct sock *sk, const int active);
+int dccp_invalid_packet(struct sk_buff *skb);
+u32 dccp_sample_rtt(struct sock *sk, long delta);
static inline int dccp_bad_service_code(const struct sock *sk,
- const __u32 service)
+ const __be32 service)
{
const struct dccp_sock *dp = dccp_sk(sk);
@@ -279,10 +337,29 @@ static inline int dccp_bad_service_code(const struct sock *sk,
return !dccp_list_has_service(dp->dccps_service_list, service);
}
+/**
+ * dccp_skb_cb - DCCP per-packet control information
+ * @header: union of the IPv4 and (if IPv6 is enabled) IPv6 per-packet parameters
+ * @dccpd_type: one of %dccp_pkt_type (or unknown)
+ * @dccpd_ccval: CCVal field (5.1), see e.g. RFC 4342, 8.1
+ * @dccpd_reset_code: one of %dccp_reset_codes
+ * @dccpd_reset_data: Data1..3 fields (depend on @dccpd_reset_code)
+ * @dccpd_opt_len: total length of all options (5.8) in the packet
+ * @dccpd_seq: sequence number
+ * @dccpd_ack_seq: acknowledgment number subheader field value
+ *
+ * This is used for transmission as well as for reception.
+ */
struct dccp_skb_cb {
+ union {
+ struct inet_skb_parm h4;
+#if IS_ENABLED(CONFIG_IPV6)
+ struct inet6_skb_parm h6;
+#endif
+ } header;
__u8 dccpd_type:4;
__u8 dccpd_ccval:4;
- __u8 dccpd_reset_code;
+ __u8 dccpd_reset_code,
+ dccpd_reset_data[3];
__u16 dccpd_opt_len;
__u64 dccpd_seq;
__u64 dccpd_ack_seq;
@@ -290,6 +367,7 @@ struct dccp_skb_cb {
#define DCCP_SKB_CB(__skb) ((struct dccp_skb_cb *)&((__skb)->cb[0]))
+/* RFC 4340, sec. 7.7 */
static inline int dccp_non_data_packet(const struct sk_buff *skb)
{
const __u8 type = DCCP_SKB_CB(skb)->dccpd_type;
@@ -302,59 +380,40 @@ static inline int dccp_non_data_packet(const struct sk_buff *skb)
type == DCCP_PKT_SYNCACK;
}
-static inline int dccp_packet_without_ack(const struct sk_buff *skb)
+/* RFC 4340, sec. 7.7 */
+static inline int dccp_data_packet(const struct sk_buff *skb)
{
const __u8 type = DCCP_SKB_CB(skb)->dccpd_type;
- return type == DCCP_PKT_DATA || type == DCCP_PKT_REQUEST;
+ return type == DCCP_PKT_DATA ||
+ type == DCCP_PKT_DATAACK ||
+ type == DCCP_PKT_REQUEST ||
+ type == DCCP_PKT_RESPONSE;
}
-#define DCCP_MAX_SEQNO ((((u64)1) << 48) - 1)
-#define DCCP_PKT_WITHOUT_ACK_SEQ (DCCP_MAX_SEQNO << 2)
-
-static inline void dccp_set_seqno(u64 *seqno, u64 value)
+static inline int dccp_packet_without_ack(const struct sk_buff *skb)
{
- if (value > DCCP_MAX_SEQNO)
- value -= DCCP_MAX_SEQNO + 1;
- *seqno = value;
-}
+ const __u8 type = DCCP_SKB_CB(skb)->dccpd_type;
-static inline u64 dccp_delta_seqno(u64 seqno1, u64 seqno2)
-{
- return ((seqno2 << 16) - (seqno1 << 16)) >> 16;
+ return type == DCCP_PKT_DATA || type == DCCP_PKT_REQUEST;
}
-static inline void dccp_inc_seqno(u64 *seqno)
-{
- if (++*seqno > DCCP_MAX_SEQNO)
- *seqno = 0;
-}
+#define DCCP_PKT_WITHOUT_ACK_SEQ (UINT48_MAX << 2)
+/*
+ * Store the sequence number @gss in the packet header: dccph_seq2 is cleared,
+ * bits 32..47 go into the 16-bit dccph_seq field and the low 32 bits into
+ * dccph_seq_low of the extension header that directly follows @dh. Both
+ * values are written in network byte order.
+ */
static inline void dccp_hdr_set_seq(struct dccp_hdr *dh, const u64 gss)
{
struct dccp_hdr_ext *dhx = (struct dccp_hdr_ext *)((void *)dh +
sizeof(*dh));
-
-#if defined(__LITTLE_ENDIAN_BITFIELD)
- dh->dccph_seq = htonl((gss >> 32)) >> 8;
-#elif defined(__BIG_ENDIAN_BITFIELD)
- dh->dccph_seq = htonl((gss >> 32));
-#else
-#error "Adjust your <asm/byteorder.h> defines"
-#endif
+ dh->dccph_seq2 = 0;
+ /* only 16 bits fit into dccph_seq, so mask with 0xffff (not 0xfffff) */
+ dh->dccph_seq = htons((gss >> 32) & 0xffff);
dhx->dccph_seq_low = htonl(gss & 0xffffffff);
}
+/*
+ * Store the acknowledgment number @gsr in the ack subheader @dhack: the
+ * reserved field is cleared, bits 32 and up go through htons() into
+ * dccph_ack_nr_high and the low 32 bits through htonl() into
+ * dccph_ack_nr_low.
+ */
static inline void dccp_hdr_set_ack(struct dccp_hdr_ack_bits *dhack,
const u64 gsr)
{
-#if defined(__LITTLE_ENDIAN_BITFIELD)
- dhack->dccph_ack_nr_high = htonl((gsr >> 32)) >> 8;
-#elif defined(__BIG_ENDIAN_BITFIELD)
- dhack->dccph_ack_nr_high = htonl((gsr >> 32));
-#else
-#error "Adjust your <asm/byteorder.h> defines"
-#endif
+ dhack->dccph_reserved1 = 0;
+ dhack->dccph_ack_nr_high = htons(gsr >> 32);
dhack->dccph_ack_nr_low = htonl(gsr & 0xffffffff);
}
@@ -362,86 +421,81 @@ static inline void dccp_update_gsr(struct sock *sk, u64 seq)
{
struct dccp_sock *dp = dccp_sk(sk);
- dp->dccps_gsr = seq;
- dccp_set_seqno(&dp->dccps_swl,
- (dp->dccps_gsr + 1 -
- (dp->dccps_options.dccpo_sequence_window / 4)));
- dccp_set_seqno(&dp->dccps_swh,
- (dp->dccps_gsr +
- (3 * dp->dccps_options.dccpo_sequence_window) / 4));
+ if (after48(seq, dp->dccps_gsr))
+ dp->dccps_gsr = seq;
+ /* Sequence validity window depends on remote Sequence Window (7.5.1) */
+ dp->dccps_swl = SUB48(ADD48(dp->dccps_gsr, 1), dp->dccps_r_seq_win / 4);
+ /*
+ * Adjust SWL so that it is not below ISR. In contrast to RFC 4340,
+ * 7.5.1 we perform this check beyond the initial handshake: W/W' are
+ * always > 32, so for the first W/W' packets in the lifetime of a
+ * connection we always have to adjust SWL.
+ * A second reason why we are doing this is that the window depends on
+ * the feature-remote value of Sequence Window: nothing stops the peer
+ * from updating this value while we are busy adjusting SWL for the
+ * first W packets (we would have to count from scratch again then).
+ * Therefore it is safer to always make sure that the Sequence Window
+ * is not artificially extended by a peer who grows SWL downwards by
+ * continually updating the feature-remote Sequence-Window.
+ * If sequence numbers wrap it is bad luck. But that will take a while
+ * (48 bit), and this measure prevents Sequence-number attacks.
+ */
+ if (before48(dp->dccps_swl, dp->dccps_isr))
+ dp->dccps_swl = dp->dccps_isr;
+ dp->dccps_swh = ADD48(dp->dccps_gsr, (3 * dp->dccps_r_seq_win) / 4);
}
static inline void dccp_update_gss(struct sock *sk, u64 seq)
{
struct dccp_sock *dp = dccp_sk(sk);
- dp->dccps_awh = dp->dccps_gss = seq;
- dccp_set_seqno(&dp->dccps_awl,
- (dp->dccps_gss -
- dp->dccps_options.dccpo_sequence_window + 1));
-}
-
-static inline int dccp_ack_pending(const struct sock *sk)
-{
- const struct dccp_sock *dp = dccp_sk(sk);
- return dp->dccps_timestamp_echo != 0 ||
-#ifdef CONFIG_IP_DCCP_ACKVEC
- (dp->dccps_options.dccpo_send_ack_vector &&
- dccp_ackvec_pending(dp->dccps_hc_rx_ackvec)) ||
-#endif
- inet_csk_ack_scheduled(sk);
+ dp->dccps_gss = seq;
+ /* Ack validity window depends on local Sequence Window value (7.5.1) */
+ dp->dccps_awl = SUB48(ADD48(dp->dccps_gss, 1), dp->dccps_l_seq_win);
+ /* Adjust AWL so that it is not below ISS - see comment above for SWL */
+ if (before48(dp->dccps_awl, dp->dccps_iss))
+ dp->dccps_awl = dp->dccps_iss;
+ dp->dccps_awh = dp->dccps_gss;
}
-extern void dccp_insert_options(struct sock *sk, struct sk_buff *skb);
-extern void dccp_insert_option_elapsed_time(struct sock *sk,
- struct sk_buff *skb,
- u32 elapsed_time);
-extern void dccp_insert_option_timestamp(struct sock *sk,
- struct sk_buff *skb);
-extern void dccp_insert_option(struct sock *sk, struct sk_buff *skb,
- unsigned char option,
- const void *value, unsigned char len);
-
-extern struct socket *dccp_ctl_socket;
-
-extern void dccp_timestamp(const struct sock *sk, struct timeval *tv);
-
-static inline suseconds_t timeval_usecs(const struct timeval *tv)
+static inline int dccp_ackvec_pending(const struct sock *sk)
{
- return tv->tv_sec * USEC_PER_SEC + tv->tv_usec;
+ return dccp_sk(sk)->dccps_hc_rx_ackvec != NULL &&
+ !dccp_ackvec_is_empty(dccp_sk(sk)->dccps_hc_rx_ackvec);
}
-static inline suseconds_t timeval_delta(const struct timeval *large,
- const struct timeval *small)
+static inline int dccp_ack_pending(const struct sock *sk)
{
- time_t secs = large->tv_sec - small->tv_sec;
- suseconds_t usecs = large->tv_usec - small->tv_usec;
-
- if (usecs < 0) {
- secs--;
- usecs += USEC_PER_SEC;
- }
- return secs * USEC_PER_SEC + usecs;
+ return dccp_ackvec_pending(sk) || inet_csk_ack_scheduled(sk);
}
-static inline void timeval_add_usecs(struct timeval *tv,
- const suseconds_t usecs)
+int dccp_feat_signal_nn_change(struct sock *sk, u8 feat, u64 nn_val);
+int dccp_feat_finalise_settings(struct dccp_sock *dp);
+int dccp_feat_server_ccid_dependencies(struct dccp_request_sock *dreq);
+int dccp_feat_insert_opts(struct dccp_sock*, struct dccp_request_sock*,
+ struct sk_buff *skb);
+int dccp_feat_activate_values(struct sock *sk, struct list_head *fn);
+void dccp_feat_list_purge(struct list_head *fn_list);
+
+int dccp_insert_options(struct sock *sk, struct sk_buff *skb);
+int dccp_insert_options_rsk(struct dccp_request_sock *, struct sk_buff *);
+u32 dccp_timestamp(void);
+void dccp_timestamping_init(void);
+int dccp_insert_option(struct sk_buff *skb, unsigned char option,
+ const void *value, unsigned char len);
+
+#ifdef CONFIG_SYSCTL
+int dccp_sysctl_init(void);
+void dccp_sysctl_exit(void);
+#else
+static inline int dccp_sysctl_init(void)
{
- tv->tv_usec += usecs;
- while (tv->tv_usec >= USEC_PER_SEC) {
- tv->tv_sec++;
- tv->tv_usec -= USEC_PER_SEC;
- }
+ return 0;
}
-static inline void timeval_sub_usecs(struct timeval *tv,
- const suseconds_t usecs)
+static inline void dccp_sysctl_exit(void)
{
- tv->tv_usec -= usecs;
- while (tv->tv_usec < 0) {
- tv->tv_sec--;
- tv->tv_usec += USEC_PER_SEC;
- }
}
+#endif
#endif /* _DCCP_H */
diff --git a/net/dccp/diag.c b/net/dccp/diag.c
index 3f78c00e382..028fc43aacb 100644
--- a/net/dccp/diag.c
+++ b/net/dccp/diag.c
@@ -9,7 +9,6 @@
* published by the Free Software Foundation.
*/
-#include <linux/config.h>
#include <linux/module.h>
#include <linux/inet_diag.h>
@@ -30,11 +29,14 @@ static void dccp_get_info(struct sock *sk, struct tcp_info *info)
info->tcpi_backoff = icsk->icsk_backoff;
info->tcpi_pmtu = icsk->icsk_pmtu_cookie;
- if (dp->dccps_options.dccpo_send_ack_vector)
+ if (dp->dccps_hc_rx_ackvec != NULL)
info->tcpi_options |= TCPI_OPT_SACK;
- ccid_hc_rx_get_info(dp->dccps_hc_rx_ccid, sk, info);
- ccid_hc_tx_get_info(dp->dccps_hc_tx_ccid, sk, info);
+ if (dp->dccps_hc_rx_ccid != NULL)
+ ccid_hc_rx_get_info(dp->dccps_hc_rx_ccid, sk, info);
+
+ if (dp->dccps_hc_tx_ccid != NULL)
+ ccid_hc_tx_get_info(dp->dccps_hc_tx_ccid, sk, info);
}
static void dccp_diag_get_info(struct sock *sk, struct inet_diag_msg *r,
@@ -46,11 +48,23 @@ static void dccp_diag_get_info(struct sock *sk, struct inet_diag_msg *r,
dccp_get_info(sk, _info);
}
-static struct inet_diag_handler dccp_diag_handler = {
- .idiag_hashinfo = &dccp_hashinfo,
+/* inet_diag ->dump callback: delegate to the generic icsk dump using dccp_hashinfo. */
+static void dccp_diag_dump(struct sk_buff *skb, struct netlink_callback *cb,
+ struct inet_diag_req_v2 *r, struct nlattr *bc)
+{
+ inet_diag_dump_icsk(&dccp_hashinfo, skb, cb, r, bc);
+}
+
+/*
+ * inet_diag ->dump_one callback: delegate to the generic single-socket icsk
+ * dump using dccp_hashinfo and pass its return value back unchanged.
+ */
+static int dccp_diag_dump_one(struct sk_buff *in_skb, const struct nlmsghdr *nlh,
+ struct inet_diag_req_v2 *req)
+{
+ return inet_diag_dump_one_icsk(&dccp_hashinfo, in_skb, nlh, req);
+}
+
+/* inet_diag handler for DCCP sockets, keyed by protocol number IPPROTO_DCCP. */
+static const struct inet_diag_handler dccp_diag_handler = {
+ .dump = dccp_diag_dump,
+ .dump_one = dccp_diag_dump_one,
.idiag_get_info = dccp_diag_get_info,
- .idiag_type = DCCPDIAG_GETSOCK,
- .idiag_info_size = sizeof(struct tcp_info),
+ .idiag_type = IPPROTO_DCCP,
};
static int __init dccp_diag_init(void)
@@ -69,3 +83,4 @@ module_exit(dccp_diag_fini);
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Arnaldo Carvalho de Melo <acme@mandriva.com>");
MODULE_DESCRIPTION("DCCP inet_diag handler");
+MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 2-33 /* AF_INET - IPPROTO_DCCP */);
diff --git a/net/dccp/feat.c b/net/dccp/feat.c
new file mode 100644
index 00000000000..9733ddbc96c
--- /dev/null
+++ b/net/dccp/feat.c
@@ -0,0 +1,1561 @@
+/*
+ * net/dccp/feat.c
+ *
+ * Feature negotiation for the DCCP protocol (RFC 4340, section 6)
+ *
+ * Copyright (c) 2008 Gerrit Renker <gerrit@erg.abdn.ac.uk>
+ * Rewrote from scratch, some bits from earlier code by
+ * Copyright (c) 2005 Andrea Bittau <a.bittau@cs.ucl.ac.uk>
+ *
+ *
+ * ASSUMPTIONS
+ * -----------
+ * o Feature negotiation is coordinated with connection setup (as in TCP), wild
+ * changes of parameters of an established connection are not supported.
+ * o Changing non-negotiable (NN) values is supported in state OPEN/PARTOPEN.
+ * o All currently known SP features have 1-byte quantities. If in the future
+ * extensions of RFCs 4340..42 define features with item lengths larger than
+ * one byte, a feature-specific extension of the code will be required.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+#include <linux/module.h>
+#include <linux/slab.h>
+#include "ccid.h"
+#include "feat.h"
+
+/*
+ * feature-specific sysctls - initialised to the defaults from RFC 4340, 6.4:
+ * a Sequence Window of 100 and CCID 2 for both the RX and the TX CCID.
+ */
+unsigned long sysctl_dccp_sequence_window __read_mostly = 100;
+int sysctl_dccp_rx_ccid __read_mostly = 2,
+ sysctl_dccp_tx_ccid __read_mostly = 2;
+
+/*
+ * Feature activation handlers.
+ *
+ * These all use an u64 argument, to provide enough room for NN/SP features. At
+ * this stage the negotiated values have been checked to be within their range.
+ */
+/*
+ * Activation handler for the CCID feature: create the negotiated CCID with
+ * ccid_new() and install it as the RX or TX half-connection CCID, deleting
+ * the one it replaces. Returns 0 on success, or -ENOMEM if ccid_new() returned
+ * NULL (the currently installed CCID is then left untouched).
+ */
+static int dccp_hdlr_ccid(struct sock *sk, u64 ccid, bool rx)
+{
+	struct ccid *replacement = ccid_new(ccid, sk, rx);
+	struct dccp_sock *dp;
+
+	if (replacement == NULL)
+		return -ENOMEM;
+
+	dp = dccp_sk(sk);
+	if (!rx) {
+		ccid_hc_tx_delete(dp->dccps_hc_tx_ccid, sk);
+		dp->dccps_hc_tx_ccid = replacement;
+		return 0;
+	}
+
+	ccid_hc_rx_delete(dp->dccps_hc_rx_ccid, sk);
+	dp->dccps_hc_rx_ccid = replacement;
+	return 0;
+}
+
+/*
+ * Activation handler for the Sequence Window feature. On the RX side the
+ * value is stored as the remote window and SWL/SWH are recomputed; on the TX
+ * side it is stored as the local window and AWL is recomputed. Always
+ * returns 0.
+ */
+static int dccp_hdlr_seq_win(struct sock *sk, u64 seq_win, bool rx)
+{
+	struct dccp_sock *dp = dccp_sk(sk);
+
+	if (!rx) {
+		dp->dccps_l_seq_win = seq_win;
+		/* re-run the GSS update so that AWL reflects the new window */
+		dccp_update_gss(sk, dp->dccps_gss);
+		return 0;
+	}
+
+	dp->dccps_r_seq_win = seq_win;
+	/* re-run the GSR update so that SWL/SWH reflect the new window */
+	dccp_update_gsr(sk, dp->dccps_gsr);
+	return 0;
+}
+
+/*
+ * Activation handler for the Ack Ratio feature: record @ratio as the remote
+ * (RX) or local (TX) Ack Ratio of the socket. Always returns 0.
+ */
+static int dccp_hdlr_ack_ratio(struct sock *sk, u64 ratio, bool rx)
+{
+	struct dccp_sock *dp = dccp_sk(sk);
+
+	if (!rx) {
+		dp->dccps_l_ack_ratio = ratio;
+		return 0;
+	}
+
+	dp->dccps_r_ack_ratio = ratio;
+	return 0;
+}
+
+/*
+ * Activation handler for the Send Ack Vector feature. Only the RX side acts:
+ * enabling allocates an Ack Vector if the socket has none yet, disabling
+ * frees it and clears the pointer. Returns 0, or -ENOMEM if the allocation
+ * fails.
+ */
+static int dccp_hdlr_ackvec(struct sock *sk, u64 enable, bool rx)
+{
+	struct dccp_sock *dp = dccp_sk(sk);
+
+	if (!rx)
+		return 0;
+
+	if (!enable) {
+		dccp_ackvec_free(dp->dccps_hc_rx_ackvec);
+		dp->dccps_hc_rx_ackvec = NULL;
+		return 0;
+	}
+
+	/* enabling: keep an Ack Vector that is already there */
+	if (dp->dccps_hc_rx_ackvec != NULL)
+		return 0;
+
+	dp->dccps_hc_rx_ackvec = dccp_ackvec_alloc(gfp_any());
+	return dp->dccps_hc_rx_ackvec == NULL ? -ENOMEM : 0;
+}
+
+/*
+ * Activation handler for the Send NDP Count feature: on the TX side, record
+ * whether NDP counts are to be sent (any non-zero @enable). The RX side needs
+ * no action. Always returns 0.
+ */
+static int dccp_hdlr_ndp(struct sock *sk, u64 enable, bool rx)
+{
+	if (rx)
+		return 0;
+
+	dccp_sk(sk)->dccps_send_ndp_count = (enable != 0);
+	return 0;
+}
+
+/*
+ * Activation handler for Minimum Checksum Coverage, a feature located at the
+ * RX side (RFC 4340, 9.2.1).
+ *
+ * `rx' is set when the sending peer tells us about its partial coverage via a
+ * ChangeR() option; the value is then stored as the receive coverage. In the
+ * opposite case we are the sender and the receiver has announced its coverage
+ * via ChangeL(): we honour this by adopting the value as our send coverage,
+ * but only if none has been configured manually before. If a manual setting
+ * exists and is smaller than what the peer asks for, a warning is logged,
+ * since this means that all packets will be dropped. Always returns 0.
+ */
+static int dccp_hdlr_min_cscov(struct sock *sk, u64 cscov, bool rx)
+{
+	struct dccp_sock *dp = dccp_sk(sk);
+
+	if (rx) {
+		dp->dccps_pcrlen = cscov;
+		return 0;
+	}
+
+	if (dp->dccps_pcslen == 0) {
+		dp->dccps_pcslen = cscov;
+		return 0;
+	}
+
+	if (cscov > dp->dccps_pcslen)
+		DCCP_WARN("CsCov %u too small, peer requires >= %u\n",
+			  dp->dccps_pcslen, (u8)cscov);
+	return 0;
+}
+
+static const struct {
+ u8 feat_num; /* DCCPF_xxx */
+ enum dccp_feat_type rxtx; /* RX or TX */
+ enum dccp_feat_type reconciliation; /* SP or NN */
+ u8 default_value; /* as in 6.4 */
+ int (*activation_hdlr)(struct sock *sk, u64 val, bool rx);
+/*
+ * Lookup table for location and type of features (from RFC 4340/4342)
+ *
+ * Each initialiser row below lists: feature number, location, reconciliation
+ * type, default value and activation handler (NULL when the feature has no
+ * handler). The row order matters: dccp_feat_index() maps the features up to
+ * DCCPF_DATA_CHECKSUM to position feat_num - 1 and DCCPF_SEND_LEV_RATE to the
+ * last position.
+ * +--------------------------+----+-----+----+----+---------+-----------+
+ * | Feature | Location | Reconc. | Initial | Section |
+ * | | RX | TX | SP | NN | Value | Reference |
+ * +--------------------------+----+-----+----+----+---------+-----------+
+ * | DCCPF_CCID | | X | X | | 2 | 10 |
+ * | DCCPF_SHORT_SEQNOS | | X | X | | 0 | 7.6.1 |
+ * | DCCPF_SEQUENCE_WINDOW | | X | | X | 100 | 7.5.2 |
+ * | DCCPF_ECN_INCAPABLE | X | | X | | 0 | 12.1 |
+ * | DCCPF_ACK_RATIO | | X | | X | 2 | 11.3 |
+ * | DCCPF_SEND_ACK_VECTOR | X | | X | | 0 | 11.5 |
+ * | DCCPF_SEND_NDP_COUNT | | X | X | | 0 | 7.7.2 |
+ * | DCCPF_MIN_CSUM_COVER | X | | X | | 0 | 9.2.1 |
+ * | DCCPF_DATA_CHECKSUM | X | | X | | 0 | 9.3.1 |
+ * | DCCPF_SEND_LEV_RATE | X | | X | | 0 | 4342/8.4 |
+ * +--------------------------+----+-----+----+----+---------+-----------+
+ */
+} dccp_feat_table[] = {
+ { DCCPF_CCID, FEAT_AT_TX, FEAT_SP, 2, dccp_hdlr_ccid },
+ { DCCPF_SHORT_SEQNOS, FEAT_AT_TX, FEAT_SP, 0, NULL },
+ { DCCPF_SEQUENCE_WINDOW, FEAT_AT_TX, FEAT_NN, 100, dccp_hdlr_seq_win },
+ { DCCPF_ECN_INCAPABLE, FEAT_AT_RX, FEAT_SP, 0, NULL },
+ { DCCPF_ACK_RATIO, FEAT_AT_TX, FEAT_NN, 2, dccp_hdlr_ack_ratio},
+ { DCCPF_SEND_ACK_VECTOR, FEAT_AT_RX, FEAT_SP, 0, dccp_hdlr_ackvec },
+ { DCCPF_SEND_NDP_COUNT, FEAT_AT_TX, FEAT_SP, 0, dccp_hdlr_ndp },
+ { DCCPF_MIN_CSUM_COVER, FEAT_AT_RX, FEAT_SP, 0, dccp_hdlr_min_cscov},
+ { DCCPF_DATA_CHECKSUM, FEAT_AT_RX, FEAT_SP, 0, NULL },
+ { DCCPF_SEND_LEV_RATE, FEAT_AT_RX, FEAT_SP, 0, NULL },
+};
+#define DCCP_FEAT_SUPPORTED_MAX ARRAY_SIZE(dccp_feat_table)
+
+/**
+ * dccp_feat_index - Map a feature number onto its dccp_feat_table position
+ * @feat_num: feature number to look up
+ *
+ * Returns the array index of @feat_num, or -1 if the feature is not
+ * understood.
+ */
+static int dccp_feat_index(u8 feat_num)
+{
+	int idx = -1;
+
+	if (feat_num > DCCPF_RESERVED && feat_num <= DCCPF_DATA_CHECKSUM) {
+		/* the types from RFC 4340, 6.4 occupy the first 9 entries */
+		idx = feat_num - 1;
+	} else {
+		/*
+		 * Other features: after adding a new feature type to the
+		 * table, add a case for it here.
+		 */
+		switch (feat_num) {
+		case DCCPF_SEND_LEV_RATE:
+			idx = DCCP_FEAT_SUPPORTED_MAX - 1;
+			break;
+		}
+	}
+	return idx;
+}
+
+/*
+ * Reconciliation type of @feat_num as recorded in dccp_feat_table, or
+ * FEAT_UNKNOWN if the feature is not in the table.
+ */
+static u8 dccp_feat_type(u8 feat_num)
+{
+	const int pos = dccp_feat_index(feat_num);
+
+	return pos < 0 ? FEAT_UNKNOWN : dccp_feat_table[pos].reconciliation;
+}
+
+/*
+ * Default value of @feat_num from dccp_feat_table. Unknown features have no
+ * default, so a negative index points to a serious problem elsewhere: it is
+ * reported via DCCP_BUG_ON() and 0 is returned.
+ */
+static int dccp_feat_default_value(u8 feat_num)
+{
+	int idx = dccp_feat_index(feat_num);
+
+	DCCP_BUG_ON(idx < 0);
+
+	if (idx < 0)
+		return 0;
+	return dccp_feat_table[idx].default_value;
+}
+
+/*
+ * Debugging and verbose-printing section
+ */
+/*
+ * Printable name of feature number @feat. Numbers up to DCCPF_DATA_CHECKSUM
+ * are looked up in a local name table; everything else is mapped to
+ * "Reserved", "Send Loss Event Rate" or "CCID-specific" by the range checks
+ * below.
+ */
+static const char *dccp_feat_fname(const u8 feat)
+{
+ static const char *const feature_names[] = {
+ [DCCPF_RESERVED] = "Reserved",
+ [DCCPF_CCID] = "CCID",
+ [DCCPF_SHORT_SEQNOS] = "Allow Short Seqnos",
+ [DCCPF_SEQUENCE_WINDOW] = "Sequence Window",
+ [DCCPF_ECN_INCAPABLE] = "ECN Incapable",
+ [DCCPF_ACK_RATIO] = "Ack Ratio",
+ [DCCPF_SEND_ACK_VECTOR] = "Send ACK Vector",
+ [DCCPF_SEND_NDP_COUNT] = "Send NDP Count",
+ [DCCPF_MIN_CSUM_COVER] = "Min. Csum Coverage",
+ [DCCPF_DATA_CHECKSUM] = "Send Data Checksum",
+ };
+ /* numbers between the last named feature and the CCID-specific range */
+ if (feat > DCCPF_DATA_CHECKSUM && feat < DCCPF_MIN_CCID_SPECIFIC)
+ return feature_names[DCCPF_RESERVED];
+
+ /*
+ * NOTE(review): tested before the CCID-specific range, presumably because
+ * DCCPF_SEND_LEV_RATE lies inside that range - confirm against the enum.
+ */
+ if (feat == DCCPF_SEND_LEV_RATE)
+ return "Send Loss Event Rate";
+ if (feat >= DCCPF_MIN_CCID_SPECIFIC)
+ return "CCID-specific";
+
+ /* here feat <= DCCPF_DATA_CHECKSUM, i.e. within the table */
+ return feature_names[feat];
+}
+
+/*
+ * Printable names of the feature-negotiation states, indexed by entry->state
+ * in dccp_feat_print_entry().
+ * NOTE(review): the order presumably mirrors the state enum - confirm in feat.h.
+ */
+static const char *const dccp_feat_sname[] = {
+ "DEFAULT", "INITIALISING", "CHANGING", "UNSTABLE", "STABLE",
+};
+
+#ifdef CONFIG_IP_DCCP_DEBUG
+/*
+ * Printable name of feature-negotiation option type @opt (Change/Confirm,
+ * L/R), or NULL if @opt is none of these four.
+ */
+static const char *dccp_feat_oname(const u8 opt)
+{
+	const char *name = NULL;
+
+	if (opt == DCCPO_CHANGE_L)
+		name = "Change_L";
+	else if (opt == DCCPO_CONFIRM_L)
+		name = "Confirm_L";
+	else if (opt == DCCPO_CHANGE_R)
+		name = "Change_R";
+	else if (opt == DCCPO_CONFIRM_R)
+		name = "Confirm_R";
+
+	return name;
+}
+
+/*
+ * Debug helper: append the value @val of feature @feat_num to the current
+ * debug line. SP values are printed as a space-separated list of their
+ * entries, NN values as a single number; a missing value prints "(NULL)" and
+ * a feature of unknown type prints the type code.
+ */
+static void dccp_feat_printval(u8 feat_num, dccp_feat_val const *val)
+{
+ u8 i, type = dccp_feat_type(feat_num);
+
+ if (val == NULL || (type == FEAT_SP && val->sp.vec == NULL))
+ dccp_pr_debug_cat("(NULL)");
+ else if (type == FEAT_SP)
+ for (i = 0; i < val->sp.len; i++)
+ dccp_pr_debug_cat("%s%u", i ? " " : "", val->sp.vec[i]);
+ else if (type == FEAT_NN)
+ dccp_pr_debug_cat("%llu", (unsigned long long)val->nn);
+ else
+ dccp_pr_debug_cat("unknown type %u", type);
+}
+
+/*
+ * Debug helper: print the raw option payload @list of @len bytes as the value
+ * of feature @feat_num. The payload is first wrapped as an SP list; for NN
+ * features it is instead decoded with dccp_decode_value_var() and stored in
+ * the nn member before printing.
+ */
+static void dccp_feat_printvals(u8 feat_num, u8 *list, u8 len)
+{
+ u8 type = dccp_feat_type(feat_num);
+ dccp_feat_val fval = { .sp.vec = list, .sp.len = len };
+
+ if (type == FEAT_NN)
+ fval.nn = dccp_decode_value_var(list, len);
+ dccp_feat_printval(feat_num, &fval);
+}
+
+/*
+ * Debug helper: print one feature-negotiation list entry on a single line -
+ * local/remote, feature name, value, state name and whether a Confirm is
+ * still pending.
+ */
+static void dccp_feat_print_entry(struct dccp_feat_entry const *entry)
+{
+ dccp_debug(" * %s %s = ", entry->is_local ? "local" : "remote",
+ dccp_feat_fname(entry->feat_num));
+ dccp_feat_printval(entry->feat_num, &entry->val);
+ dccp_pr_debug_cat(", state=%s %s\n", dccp_feat_sname[entry->state],
+ entry->needs_confirm ? "(Confirm pending)" : "");
+}
+
+/*
+ * Debug helper: print a feature-negotiation option as "Opt(Feature, values)",
+ * followed by "!" when @mandatory is set.
+ */
+#define dccp_feat_print_opt(opt, feat, val, len, mandatory) do { \
+ dccp_pr_debug("%s(%s, ", dccp_feat_oname(opt), dccp_feat_fname(feat));\
+ dccp_feat_printvals(feat, val, len); \
+ dccp_pr_debug_cat(") %s\n", mandatory ? "!" : ""); } while (0)
+
+/*
+ * Debug helper: dump every entry of the feature-negotiation list @fn_list via
+ * dccp_feat_print_entry(). The body is wrapped in do { } while (0) so that
+ * the expansion is a single statement and stays safe inside unbraced if/else
+ * bodies, in the same way as dccp_feat_print_opt().
+ */
+#define dccp_feat_print_fnlist(fn_list) do {				\
+	const struct dccp_feat_entry *___entry;				\
+									\
+	dccp_pr_debug("List Dump:\n");					\
+	list_for_each_entry(___entry, fn_list, node)			\
+		dccp_feat_print_entry(___entry);			\
+} while (0)
+#else /* ! CONFIG_IP_DCCP_DEBUG */
+#define dccp_feat_print_opt(opt, feat, val, len, mandatory)
+#define dccp_feat_print_fnlist(fn_list)
+#endif
+
+/*
+ * Activate one feature, identified by its position @idx in dccp_feat_table.
+ * @is_local tells whether the local or the remote feature is meant; @fval is
+ * the value to activate, or NULL to activate the table default.
+ *
+ * Returns -1 if @idx is outside the table, 0 if the feature has no activation
+ * handler, and otherwise the return value of the handler.
+ */
+static int __dccp_feat_activate(struct sock *sk, const int idx,
+ const bool is_local, dccp_feat_val const *fval)
+{
+ bool rx;
+ u64 val;
+
+ if (idx < 0 || idx >= DCCP_FEAT_SUPPORTED_MAX)
+ return -1;
+ if (dccp_feat_table[idx].activation_hdlr == NULL)
+ return 0;
+
+ if (fval == NULL) {
+ val = dccp_feat_table[idx].default_value;
+ } else if (dccp_feat_table[idx].reconciliation == FEAT_SP) {
+ if (fval->sp.vec == NULL) {
+ /*
+ * This can happen when an empty Confirm is sent
+ * for an SP (i.e. known) feature. In this case
+ * we would be using the default anyway.
+ */
+ DCCP_CRIT("Feature #%d undefined: using default", idx);
+ val = dccp_feat_table[idx].default_value;
+ } else {
+ /* SP: the value activated is the first entry of the list */
+ val = fval->sp.vec[0];
+ }
+ } else {
+ val = fval->nn;
+ }
+
+ /* Location is RX if this is a local-RX or remote-TX feature */
+ rx = (is_local == (dccp_feat_table[idx].rxtx == FEAT_AT_RX));
+
+ dccp_debug(" -> activating %s %s, %sval=%llu\n", rx ? "RX" : "TX",
+ dccp_feat_fname(dccp_feat_table[idx].feat_num),
+ fval ? "" : "default ", (unsigned long long)val);
+
+ return dccp_feat_table[idx].activation_hdlr(sk, val, rx);
+}
+
+/**
+ * dccp_feat_activate - Activate feature value on socket
+ * @sk: fully connected DCCP socket (after handshake is complete)
+ * @feat_num: feature to activate, one of %dccp_feature_numbers
+ * @local: whether local (1) or remote (0) @feat_num is meant
+ * @fval: the value (SP or NN) to activate, or NULL to use the default value
+ *
+ * For general use this function is preferable over __dccp_feat_activate().
+ * It translates @feat_num with dccp_feat_index() and returns the result of
+ * __dccp_feat_activate(), which is -1 when @feat_num is not a known feature.
+ */
+static int dccp_feat_activate(struct sock *sk, u8 feat_num, bool local,
+ dccp_feat_val const *fval)
+{
+ return __dccp_feat_activate(sk, dccp_feat_index(feat_num), local, fval);
+}
+
+/*
+ * Test for a "Req'd" feature (RFC 4340, 6.4): returns 1 for CCID, Allow Short
+ * Seqnos and Sequence Window, 0 for every other feature number.
+ */
+static inline int dccp_feat_must_be_understood(u8 feat_num)
+{
+	switch (feat_num) {
+	case DCCPF_CCID:
+	case DCCPF_SHORT_SEQNOS:
+	case DCCPF_SEQUENCE_WINDOW:
+		return 1;
+	default:
+		return 0;
+	}
+}
+
+/* copy constructor, fval must not already contain allocated memory */
+/*
+ * Store a private copy of the @len bytes at @val as the SP list of @fval.
+ * For @len == 0 only the length is recorded and sp.vec is left untouched.
+ * Returns 0 on success; if the duplication fails the length is reset to 0
+ * and -ENOBUFS is returned.
+ */
+static int dccp_feat_clone_sp_val(dccp_feat_val *fval, u8 const *val, u8 len)
+{
+	fval->sp.len = len;
+	if (fval->sp.len == 0)
+		return 0;
+
+	fval->sp.vec = kmemdup(val, len, gfp_any());
+	if (fval->sp.vec != NULL)
+		return 0;
+
+	fval->sp.len = 0;
+	return -ENOBUFS;
+}
+
+/*
+ * Release the contents of feature value @val: for SP features the list is
+ * freed, and in every case the value is zeroed afterwards. A NULL @val is
+ * ignored.
+ */
+static void dccp_feat_val_destructor(u8 feat_num, dccp_feat_val *val)
+{
+	if (val != NULL) {
+		if (dccp_feat_type(feat_num) == FEAT_SP)
+			kfree(val->sp.vec);
+		memset(val, 0, sizeof(*val));
+	}
+}
+
+/*
+ * Duplicate the list entry @original, giving the copy its own SP list when
+ * the feature is of SP type. Returns NULL if the feature type is unknown or
+ * if either allocation fails.
+ */
+static struct dccp_feat_entry *
+	      dccp_feat_clone_entry(struct dccp_feat_entry const *original)
+{
+	const u8 type = dccp_feat_type(original->feat_num);
+	struct dccp_feat_entry *copy;
+
+	if (type == FEAT_UNKNOWN)
+		return NULL;
+
+	copy = kmemdup(original, sizeof(struct dccp_feat_entry), gfp_any());
+	if (copy == NULL)
+		return NULL;
+
+	/* NN values are complete after the flat copy above */
+	if (type != FEAT_SP)
+		return copy;
+
+	if (dccp_feat_clone_sp_val(&copy->val, original->val.sp.vec,
+				   original->val.sp.len) == 0)
+		return copy;
+
+	kfree(copy);
+	return NULL;
+}
+
+/* Free list entry @entry together with its value; a NULL @entry is ignored. */
+static void dccp_feat_entry_destructor(struct dccp_feat_entry *entry)
+{
+	if (entry == NULL)
+		return;
+
+	dccp_feat_val_destructor(entry->feat_num, &entry->val);
+	kfree(entry);
+}
+
+/*
+ * List management functions
+ *
+ * Feature negotiation lists rely on and maintain the following invariants:
+ * - each feat_num in the list is known, i.e. we know its type and default value
+ * - each feat_num/is_local combination is unique (old entries are overwritten)
+ * - SP values are always freshly allocated
+ * - list is sorted in increasing order of feature number (faster lookup)
+ */
+/*
+ * Find the entry for (@feat_num, @is_local) in @fn_list. Since the list is
+ * sorted by increasing feature number, the scan stops at the first entry
+ * whose number exceeds @feat_num. Returns NULL if there is no such entry.
+ */
+static struct dccp_feat_entry *dccp_feat_list_lookup(struct list_head *fn_list,
+						     u8 feat_num, bool is_local)
+{
+	struct dccp_feat_entry *cur;
+
+	list_for_each_entry(cur, fn_list, node) {
+		if (cur->feat_num > feat_num)
+			return NULL;
+		if (cur->feat_num == feat_num && cur->is_local == is_local)
+			return cur;
+	}
+	return NULL;
+}
+
+/**
+ * dccp_feat_entry_new - Central list update routine (called by all others)
+ * @head: list to add to
+ * @feat: feature number
+ * @local: whether the local (1) or remote feature with number @feat is meant
+ *
+ * This is the only constructor and serves to ensure the above invariants.
+ * If an entry for @feat/@local already exists, its old value is released and
+ * the same entry is returned for re-use. Otherwise a new entry is allocated
+ * and inserted at its sorted position; apart from feat_num and is_local its
+ * fields are left uninitialised and have to be set by the caller.
+ * Returns NULL if the allocation fails.
+ */
+static struct dccp_feat_entry *
+ dccp_feat_entry_new(struct list_head *head, u8 feat, bool local)
+{
+ struct dccp_feat_entry *entry;
+
+ list_for_each_entry(entry, head, node)
+ if (entry->feat_num == feat && entry->is_local == local) {
+ /* overwrite: drop the old value, keep the list position */
+ dccp_feat_val_destructor(entry->feat_num, &entry->val);
+ return entry;
+ } else if (entry->feat_num > feat) {
+ /* first larger feature number: insert in front of it */
+ head = &entry->node;
+ break;
+ }
+
+ entry = kmalloc(sizeof(*entry), gfp_any());
+ if (entry != NULL) {
+ entry->feat_num = feat;
+ entry->is_local = local;
+ /* adds before @head, i.e. at the list end if no larger entry exists */
+ list_add_tail(&entry->node, head);
+ }
+ return entry;
+}
+
+/**
+ * dccp_feat_push_change - Add/overwrite a Change option in the list
+ * @fn_list: feature-negotiation list to update
+ * @feat: one of %dccp_feature_numbers
+ * @local: whether local (1) or remote (0) @feat is meant
+ * @mandatory: whether to use Mandatory feature negotiation options
+ * @fval: pointer to NN/SP value to be inserted (copied by struct assignment,
+ * so an SP vector is not duplicated but taken over by the list entry)
+ *
+ * Returns 0 on success, -ENOMEM if no list entry could be obtained; in the
+ * latter case @fval has not been stored anywhere.
+ */
+static int dccp_feat_push_change(struct list_head *fn_list, u8 feat, u8 local,
+ u8 mandatory, dccp_feat_val *fval)
+{
+ struct dccp_feat_entry *new = dccp_feat_entry_new(fn_list, feat, local);
+
+ if (new == NULL)
+ return -ENOMEM;
+
+ new->feat_num = feat;
+ new->is_local = local;
+ new->state = FEAT_INITIALISING;
+ new->needs_confirm = false;
+ new->empty_confirm = false;
+ new->val = *fval;
+ new->needs_mandatory = mandatory;
+
+ return 0;
+}
+
+/**
+ * dccp_feat_push_confirm - Add a Confirm entry to the FN list
+ * @fn_list: feature-negotiation list to add to
+ * @feat: one of %dccp_feature_numbers
+ * @local: whether local (1) or remote (0) @feat is being confirmed
+ * @fval: pointer to NN/SP value to be inserted or NULL
+ *
+ * A NULL @fval generates an empty Confirm. Otherwise @fval is copied by
+ * struct assignment, so an SP vector is taken over by the list entry rather
+ * than duplicated.
+ *
+ * Returns 0 on success, a Reset code for further processing otherwise. On
+ * failure @fval has not been stored anywhere.
+ */
+static int dccp_feat_push_confirm(struct list_head *fn_list, u8 feat, u8 local,
+				  dccp_feat_val *fval)
+{
+	struct dccp_feat_entry *new = dccp_feat_entry_new(fn_list, feat, local);
+
+	if (new == NULL)
+		return DCCP_RESET_CODE_TOO_BUSY;
+
+	new->feat_num	     = feat;
+	new->is_local	     = local;
+	new->state	     = FEAT_STABLE;	/* transition in 6.6.2 */
+	new->needs_confirm   = true;
+	new->empty_confirm   = (fval == NULL);
+	new->needs_mandatory = false;
+
+	if (fval != NULL) {
+		new->val = *fval;
+	} else {
+		/*
+		 * Clear the entire union. Assigning 0 to the u64 member is not
+		 * sufficient where the SP member (pointer plus length) is
+		 * larger than 64 bits: the length would stay uninitialised.
+		 */
+		memset(&new->val, 0, sizeof(new->val));
+	}
+
+	return 0;
+}
+
+/* Queue a Confirm without value; same return codes as dccp_feat_push_confirm */
+static int dccp_push_empty_confirm(struct list_head *fn_list, u8 feat, u8 local)
+{
+ return dccp_feat_push_confirm(fn_list, feat, local, NULL);
+}
+
+/* Unlink @entry from its list and free it together with its value. */
+static inline void dccp_feat_list_pop(struct dccp_feat_entry *entry)
+{
+ list_del(&entry->node);
+ dccp_feat_entry_destructor(entry);
+}
+
+/* Free all entries of @fn_list and leave it as an empty, initialised list. */
+void dccp_feat_list_purge(struct list_head *fn_list)
+{
+ struct dccp_feat_entry *entry, *next;
+
+ /* entries are freed without unlinking: the head is re-initialised below */
+ list_for_each_entry_safe(entry, next, fn_list, node)
+ dccp_feat_entry_destructor(entry);
+ INIT_LIST_HEAD(fn_list);
+}
+EXPORT_SYMBOL_GPL(dccp_feat_list_purge);
+
+/*
+ * Generate @to as full clone of @from - @to must not contain any nodes, since
+ * it is re-initialised here without freeing anything.
+ * Returns 0 on success; if an entry cannot be cloned, the partial copy is
+ * purged and -ENOMEM is returned.
+ */
+int dccp_feat_clone_list(struct list_head const *from, struct list_head *to)
+{
+ struct dccp_feat_entry *entry, *new;
+
+ INIT_LIST_HEAD(to);
+ list_for_each_entry(entry, from, node) {
+ new = dccp_feat_clone_entry(entry);
+ if (new == NULL)
+ goto cloning_failed;
+ /* appending keeps the (sorted) order of @from */
+ list_add_tail(&new->node, to);
+ }
+ return 0;
+
+cloning_failed:
+ dccp_feat_list_purge(to);
+ return -ENOMEM;
+}
+
+/**
+ * dccp_feat_valid_nn_length - Enforce length constraints on NN options
+ * Length is between 0 and %DCCP_OPTVAL_MAXLEN. Used for outgoing packets only,
+ * incoming options are accepted as long as their values are valid.
+ * Returns the option value length in bytes, or 0 for any other feature.
+ */
+static u8 dccp_feat_valid_nn_length(u8 feat_num)
+{
+	switch (feat_num) {
+	case DCCPF_ACK_RATIO:		/* RFC 4340, 11.3 and 6.6.8 */
+		return 2;
+	case DCCPF_SEQUENCE_WINDOW:	/* RFC 4340, 7.5.2 and 6.5 */
+		return 6;
+	default:
+		return 0;
+	}
+}
+
+/*
+ * Range-check an NN value: returns non-zero if @val lies within the limits
+ * defined for @feat_num, and 0 if it does not or if @feat_num is not one of
+ * the NN features handled here.
+ */
+static u8 dccp_feat_is_valid_nn_val(u8 feat_num, u64 val)
+{
+	if (feat_num == DCCPF_ACK_RATIO)
+		return val <= DCCPF_ACK_RATIO_MAX;
+
+	if (feat_num == DCCPF_SEQUENCE_WINDOW)
+		return val >= DCCPF_SEQ_WMIN && val <= DCCPF_SEQ_WMAX;
+
+	return 0;	/* feature unknown - so we can't tell */
+}
+
+/*
+ * Check that SP values are within the ranges defined in RFC 4340.
+ * Returns non-zero if the single value @val is acceptable for @feat_num and
+ * 0 if it is out of range or @feat_num is not an SP feature listed below.
+ */
+static u8 dccp_feat_is_valid_sp_val(u8 feat_num, u8 val)
+{
+ switch (feat_num) {
+ case DCCPF_CCID:
+ return val == DCCPC_CCID2 || val == DCCPC_CCID3;
+ /* Type-check Boolean feature values: */
+ case DCCPF_SHORT_SEQNOS:
+ case DCCPF_ECN_INCAPABLE:
+ case DCCPF_SEND_ACK_VECTOR:
+ case DCCPF_SEND_NDP_COUNT:
+ case DCCPF_DATA_CHECKSUM:
+ case DCCPF_SEND_LEV_RATE:
+ return val < 2;
+ case DCCPF_MIN_CSUM_COVER:
+ return val < 16;
+ }
+ return 0; /* feature unknown */
+}
+
+/*
+ * Validate an SP preference list: returns 1 if @sp_list is non-NULL, holds at
+ * least one element and every element is a valid value for @feat_num, else 0.
+ */
+static u8 dccp_feat_sp_list_ok(u8 feat_num, u8 const *sp_list, u8 sp_len)
+{
+	u8 i;
+
+	if (sp_list == NULL || sp_len < 1)
+		return 0;
+
+	for (i = 0; i < sp_len; i++) {
+		if (!dccp_feat_is_valid_sp_val(feat_num, sp_list[i]))
+			return 0;
+	}
+	return 1;
+}
+
+/**
+ * dccp_feat_insert_opts - Generate FN options from current list state
+ * @skb: next sk_buff to be sent to the peer
+ * @dp: for client during handshake and general negotiation
+ * @dreq: used by the server only (all Changes/Confirms in LISTEN/RESPOND)
+ *
+ * The list of @dreq is used when @dreq is non-NULL, the list of @dp otherwise.
+ * Returns 0 on success, -1 if an option could not be inserted or an entry
+ * with unknown feature type is found.
+ */
+int dccp_feat_insert_opts(struct dccp_sock *dp, struct dccp_request_sock *dreq,
+ struct sk_buff *skb)
+{
+ struct list_head *fn = dreq ? &dreq->dreq_featneg : &dp->dccps_featneg;
+ struct dccp_feat_entry *pos, *next;
+ u8 opt, type, len, *ptr, nn_in_nbo[DCCP_OPTVAL_MAXLEN];
+ bool rpt;
+
+ /*
+ * put entries into @skb in the order they appear in the list
+ * NOTE(review): the list is walked backwards, presumably because
+ * options are prepended to @skb - confirm against dccp_insert_fn_opt.
+ */
+ list_for_each_entry_safe_reverse(pos, next, fn, node) {
+ opt = dccp_feat_genopt(pos);
+ type = dccp_feat_type(pos->feat_num);
+ rpt = false;
+
+ if (pos->empty_confirm) {
+ len = 0;
+ ptr = NULL;
+ } else {
+ if (type == FEAT_SP) {
+ len = pos->val.sp.len;
+ ptr = pos->val.sp.vec;
+ rpt = pos->needs_confirm;
+ } else if (type == FEAT_NN) {
+ /* NN values are sent with a fixed per-feature length */
+ len = dccp_feat_valid_nn_length(pos->feat_num);
+ ptr = nn_in_nbo;
+ dccp_encode_value_var(pos->val.nn, ptr, len);
+ } else {
+ DCCP_BUG("unknown feature %u", pos->feat_num);
+ return -1;
+ }
+ }
+ dccp_feat_print_opt(opt, pos->feat_num, ptr, len, 0);
+
+ if (dccp_insert_fn_opt(skb, opt, pos->feat_num, ptr, len, rpt))
+ return -1;
+ if (pos->needs_mandatory && dccp_insert_option_mandatory(skb))
+ return -1;
+
+ if (skb->sk->sk_state == DCCP_OPEN &&
+ (opt == DCCPO_CONFIRM_R || opt == DCCPO_CONFIRM_L)) {
+ /*
+ * Confirms don't get retransmitted (6.6.3) once the
+ * connection is in state OPEN
+ */
+ dccp_feat_list_pop(pos);
+ } else {
+ /*
+ * Enter CHANGING after transmitting the Change
+ * option (6.6.2).
+ */
+ if (pos->state == FEAT_INITIALISING)
+ pos->state = FEAT_CHANGING;
+ }
+ }
+ return 0;
+}
+
+/**
+ * __feat_register_nn - Register new NN value on socket
+ * @fn: feature-negotiation list to register with
+ * @feat: an NN feature from %dccp_feature_numbers
+ * @mandatory: use Mandatory option if 1
+ * @nn_val: value to register (must pass dccp_feat_is_valid_nn_val() for @feat)
+ *
+ * Note that NN features are local by definition (RFC 4340, 6.3.2).
+ * Returns 0 on success (also when @nn_val is the default and nothing is
+ * queued), -EINVAL for a non-NN feature or invalid value, or the error code
+ * of dccp_feat_push_change().
+ */
+static int __feat_register_nn(struct list_head *fn, u8 feat,
+ u8 mandatory, u64 nn_val)
+{
+ dccp_feat_val fval = { .nn = nn_val };
+
+ if (dccp_feat_type(feat) != FEAT_NN ||
+ !dccp_feat_is_valid_nn_val(feat, nn_val))
+ return -EINVAL;
+
+ /* Don't bother with default values, they will be activated anyway. */
+ if (nn_val - (u64)dccp_feat_default_value(feat) == 0)
+ return 0;
+
+ /* NN features are always local, hence is_local = 1 */
+ return dccp_feat_push_change(fn, feat, 1, mandatory, &fval);
+}
+
+/**
+ * __feat_register_sp - Register new SP value/list on socket
+ * @fn: feature-negotiation list to register with
+ * @feat: an SP feature from %dccp_feature_numbers
+ * @is_local: whether the local (1) or the remote (0) @feat is meant
+ * @mandatory: use Mandatory option if 1
+ * @sp_val: SP value followed by optional preference list
+ * @sp_len: length of @sp_val in bytes
+ *
+ * The list at @sp_val is copied, the caller keeps ownership of its buffer.
+ * Returns 0 on success, -EINVAL for a non-SP feature or invalid list,
+ * -EOPNOTSUPP for unsupported CCIDs and -ENOMEM if memory is exhausted.
+ */
+static int __feat_register_sp(struct list_head *fn, u8 feat, u8 is_local,
+			      u8 mandatory, u8 const *sp_val, u8 sp_len)
+{
+	dccp_feat_val fval;
+
+	if (dccp_feat_type(feat) != FEAT_SP ||
+	    !dccp_feat_sp_list_ok(feat, sp_val, sp_len))
+		return -EINVAL;
+
+	/* Avoid negotiating alien CCIDs by only advertising supported ones */
+	if (feat == DCCPF_CCID && !ccid_support_check(sp_val, sp_len))
+		return -EOPNOTSUPP;
+
+	if (dccp_feat_clone_sp_val(&fval, sp_val, sp_len))
+		return -ENOMEM;
+
+	if (dccp_feat_push_change(fn, feat, is_local, mandatory, &fval)) {
+		/*
+		 * No list entry took ownership of the cloned vector, so it
+		 * has to be released here to avoid leaking it.
+		 */
+		kfree(fval.sp.vec);
+		return -ENOMEM;
+	}
+
+	return 0;
+}
+
+/**
+ * dccp_feat_register_sp - Register requests to change SP feature values
+ * @sk: client or listening socket
+ * @feat: one of %dccp_feature_numbers
+ * @is_local: whether the local (1) or remote (0) @feat is meant
+ * @list: array of preferred values, in descending order of preference
+ * @len: length of @list in bytes
+ *
+ * The request is registered without the Mandatory flag.
+ * Returns -EISCONN unless @sk is in state CLOSED, -EINVAL if @feat is not an
+ * SP feature, otherwise the result of __feat_register_sp().
+ */
+int dccp_feat_register_sp(struct sock *sk, u8 feat, u8 is_local,
+ u8 const *list, u8 len)
+{ /* any changes must be registered before establishing the connection */
+ if (sk->sk_state != DCCP_CLOSED)
+ return -EISCONN;
+ if (dccp_feat_type(feat) != FEAT_SP)
+ return -EINVAL;
+ return __feat_register_sp(&dccp_sk(sk)->dccps_featneg, feat, is_local,
+ 0, list, len);
+}
+
+/**
+ * dccp_feat_nn_get - Query current/pending value of NN feature
+ * @sk: DCCP socket of an established connection
+ * @feat: NN feature number from %dccp_feature_numbers
+ *
+ * For a known NN feature, returns value currently being negotiated, or
+ * current (confirmed) value if no negotiation is going on.
+ * For any other feature a bug is reported and 0 is returned.
+ */
+u64 dccp_feat_nn_get(struct sock *sk, u8 feat)
+{
+ if (dccp_feat_type(feat) == FEAT_NN) {
+ struct dccp_sock *dp = dccp_sk(sk);
+ struct dccp_feat_entry *entry;
+
+ /* a queued (local) entry takes precedence over the socket value */
+ entry = dccp_feat_list_lookup(&dp->dccps_featneg, feat, 1);
+ if (entry != NULL)
+ return entry->val.nn;
+
+ switch (feat) {
+ case DCCPF_ACK_RATIO:
+ return dp->dccps_l_ack_ratio;
+ case DCCPF_SEQUENCE_WINDOW:
+ return dp->dccps_l_seq_win;
+ }
+ }
+ DCCP_BUG("attempt to look up unsupported feature %u", feat);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(dccp_feat_nn_get);
+
+/**
+ * dccp_feat_signal_nn_change - Update NN values for an established connection
+ * @sk: DCCP socket of an established connection
+ * @feat: NN feature number from %dccp_feature_numbers
+ * @nn_val: the new value to use
+ *
+ * This function is used to communicate NN updates out-of-band.
+ * Returns 0 if nothing needs to be done (socket neither OPEN nor PARTOPEN,
+ * or @nn_val already set/pending), -EINVAL for a non-NN feature or invalid
+ * value, otherwise the result of dccp_feat_push_change().
+ */
+int dccp_feat_signal_nn_change(struct sock *sk, u8 feat, u64 nn_val)
+{
+ struct list_head *fn = &dccp_sk(sk)->dccps_featneg;
+ dccp_feat_val fval = { .nn = nn_val };
+ struct dccp_feat_entry *entry;
+
+ if (sk->sk_state != DCCP_OPEN && sk->sk_state != DCCP_PARTOPEN)
+ return 0;
+
+ if (dccp_feat_type(feat) != FEAT_NN ||
+ !dccp_feat_is_valid_nn_val(feat, nn_val))
+ return -EINVAL;
+
+ if (nn_val == dccp_feat_nn_get(sk, feat))
+ return 0; /* already set or negotiation under way */
+
+ /* a pending entry with a different value is replaced by the new one */
+ entry = dccp_feat_list_lookup(fn, feat, 1);
+ if (entry != NULL) {
+ dccp_pr_debug("Clobbering existing NN entry %llu -> %llu\n",
+ (unsigned long long)entry->val.nn,
+ (unsigned long long)nn_val);
+ dccp_feat_list_pop(entry);
+ }
+
+ inet_csk_schedule_ack(sk);
+ return dccp_feat_push_change(fn, feat, 1, 0, &fval);
+}
+EXPORT_SYMBOL_GPL(dccp_feat_signal_nn_change);
+
+/*
+ * Tracking features whose value depend on the choice of CCID
+ *
+ * This is designed with an extension in mind so that a list walk could be done
+ * before activating any features. However, the existing framework was found to
+ * work satisfactorily up until now, the automatic verification is left open.
+ * When adding new CCIDs, add a corresponding dependency table here.
+ *
+ * Returns the dependency table for @ccid (row 0 for the remote/RX CCID, row 1
+ * for the local/TX CCID, selected by @is_local), or NULL for a CCID without
+ * table. Each table ends with an all-zero entry; dccp_feat_propagate_ccid()
+ * stops at the first entry whose dependent_feat equals DCCPF_RESERVED.
+ */
+static const struct ccid_dependency *dccp_feat_ccid_deps(u8 ccid, bool is_local)
+{
+ static const struct ccid_dependency ccid2_dependencies[2][2] = {
+ /*
+ * CCID2 mandates Ack Vectors (RFC 4341, 4.): as CCID is a TX
+ * feature and Send Ack Vector is an RX feature, `is_local'
+ * needs to be reversed.
+ */
+ { /* Dependencies of the receiver-side (remote) CCID2 */
+ {
+ .dependent_feat = DCCPF_SEND_ACK_VECTOR,
+ .is_local = true,
+ .is_mandatory = true,
+ .val = 1
+ },
+ { 0, 0, 0, 0 }
+ },
+ { /* Dependencies of the sender-side (local) CCID2 */
+ {
+ .dependent_feat = DCCPF_SEND_ACK_VECTOR,
+ .is_local = false,
+ .is_mandatory = true,
+ .val = 1
+ },
+ { 0, 0, 0, 0 }
+ }
+ };
+ /* rows are sized for the longer (TX) table: four entries plus terminator */
+ static const struct ccid_dependency ccid3_dependencies[2][5] = {
+ { /*
+ * Dependencies of the receiver-side CCID3
+ */
+ { /* locally disable Ack Vectors */
+ .dependent_feat = DCCPF_SEND_ACK_VECTOR,
+ .is_local = true,
+ .is_mandatory = false,
+ .val = 0
+ },
+ { /* see below why Send Loss Event Rate is on */
+ .dependent_feat = DCCPF_SEND_LEV_RATE,
+ .is_local = true,
+ .is_mandatory = true,
+ .val = 1
+ },
+ { /* NDP Count is needed as per RFC 4342, 6.1.1 */
+ .dependent_feat = DCCPF_SEND_NDP_COUNT,
+ .is_local = false,
+ .is_mandatory = true,
+ .val = 1
+ },
+ { 0, 0, 0, 0 },
+ },
+ { /*
+ * CCID3 at the TX side: we request that the HC-receiver
+ * will not send Ack Vectors (they will be ignored, so
+ * Mandatory is not set); we enable Send Loss Event Rate
+ * (Mandatory since the implementation does not support
+ * the Loss Intervals option of RFC 4342, 8.6).
+ * The last two options are for peer's information only.
+ */
+ {
+ .dependent_feat = DCCPF_SEND_ACK_VECTOR,
+ .is_local = false,
+ .is_mandatory = false,
+ .val = 0
+ },
+ {
+ .dependent_feat = DCCPF_SEND_LEV_RATE,
+ .is_local = false,
+ .is_mandatory = true,
+ .val = 1
+ },
+ { /* this CCID does not support Ack Ratio */
+ .dependent_feat = DCCPF_ACK_RATIO,
+ .is_local = true,
+ .is_mandatory = false,
+ .val = 0
+ },
+ { /* tell receiver we are sending NDP counts */
+ .dependent_feat = DCCPF_SEND_NDP_COUNT,
+ .is_local = true,
+ .is_mandatory = false,
+ .val = 1
+ },
+ { 0, 0, 0, 0 }
+ }
+ };
+ switch (ccid) {
+ case DCCPC_CCID2:
+ return ccid2_dependencies[is_local];
+ case DCCPC_CCID3:
+ return ccid3_dependencies[is_local];
+ default:
+ return NULL;
+ }
+}
+
+/**
+ * dccp_feat_propagate_ccid - Resolve dependencies of features on choice of CCID
+ * @fn: feature-negotiation list to update
+ * @id: CCID number to track
+ * @is_local: whether TX CCID (1) or RX CCID (0) is meant
+ *
+ * This function needs to be called after registering all other features.
+ * Returns 0 on success, 1 if there is no dependency table for @id, or the
+ * (negative) error of the first registration that failed.
+ */
+static int dccp_feat_propagate_ccid(struct list_head *fn, u8 id, bool is_local)
+{
+ const struct ccid_dependency *table = dccp_feat_ccid_deps(id, is_local);
+ int i, rc = (table == NULL);
+
+ /* rc != 0 short-circuits the condition, so a NULL @table is never read */
+ for (i = 0; rc == 0 && table[i].dependent_feat != DCCPF_RESERVED; i++)
+ if (dccp_feat_type(table[i].dependent_feat) == FEAT_SP)
+ rc = __feat_register_sp(fn, table[i].dependent_feat,
+ table[i].is_local,
+ table[i].is_mandatory,
+ &table[i].val, 1);
+ else
+ rc = __feat_register_nn(fn, table[i].dependent_feat,
+ table[i].is_mandatory,
+ table[i].val);
+ return rc;
+}
+
+/**
+ * dccp_feat_finalise_settings - Finalise settings before starting negotiation
+ * @dp: client or listening socket (settings will be inherited)
+ *
+ * This is called after all registrations (socket initialisation, sysctls, and
+ * sockopt calls), and before sending the first packet containing Change options
+ * (ie. client-Request or server-Response), to ensure internal consistency.
+ * Returns 0 on success, -1 if propagating a CCID dependency failed.
+ */
+int dccp_feat_finalise_settings(struct dccp_sock *dp)
+{
+ struct list_head *fn = &dp->dccps_featneg;
+ struct dccp_feat_entry *entry;
+ /* ccids[] is indexed by is_local; -1 means no singleton CCID was found */
+ int i = 2, ccids[2] = { -1, -1 };
+
+ /*
+ * Propagating CCIDs:
+ * 1) not useful to propagate CCID settings if this host advertises more
+ * than one CCID: the choice of CCID may still change - if this is
+ * the client, or if this is the server and the client sends
+ * singleton CCID values.
+ * 2) since propagate_ccid changes the list, we defer changing
+ * the sorted list until after the traversal.
+ */
+ list_for_each_entry(entry, fn, node)
+ if (entry->feat_num == DCCPF_CCID && entry->val.sp.len == 1)
+ ccids[entry->is_local] = entry->val.sp.vec[0];
+ while (i--)
+ if (ccids[i] > 0 && dccp_feat_propagate_ccid(fn, ccids[i], i))
+ return -1;
+ dccp_feat_print_fnlist(fn);
+ return 0;
+}
+
+/**
+ * dccp_feat_server_ccid_dependencies - Resolve CCID-dependent features
+ * @dreq: request socket whose feature-negotiation list is updated
+ *
+ * It is the server which resolves the dependencies once the CCID has been
+ * fully negotiated. If no CCID has been negotiated, it uses the default CCID.
+ * Returns 0 on success, -1 if propagating the dependencies failed.
+ */
+int dccp_feat_server_ccid_dependencies(struct dccp_request_sock *dreq)
+{
+ struct list_head *fn = &dreq->dreq_featneg;
+ struct dccp_feat_entry *entry;
+ u8 is_local, ccid;
+
+ /* handle the remote (0) and the local (1) CCID in turn */
+ for (is_local = 0; is_local <= 1; is_local++) {
+ entry = dccp_feat_list_lookup(fn, DCCPF_CCID, is_local);
+
+ /* the first vector element is taken as the CCID in use */
+ if (entry != NULL && !entry->empty_confirm)
+ ccid = entry->val.sp.vec[0];
+ else
+ ccid = dccp_feat_default_value(DCCPF_CCID);
+
+ if (dccp_feat_propagate_ccid(fn, ccid, is_local))
+ return -1;
+ }
+ return 0;
+}
+
+/*
+ * Select the first entry in @servlist that also occurs in @clilist (6.3.1).
+ * Returns the shared value, or -1 if the two lists have nothing in common.
+ */
+static int dccp_feat_preflist_match(u8 *servlist, u8 slen, u8 *clilist, u8 clen)
+{
+	u8 si, ci;
+
+	for (si = 0; si < slen; si++) {
+		const u8 candidate = servlist[si];
+
+		for (ci = 0; ci < clen; ci++) {
+			if (clilist[ci] == candidate)
+				return candidate;
+		}
+	}
+	return -1;
+}
+
+/**
+ * dccp_feat_prefer - Move preferred entry to the start of array
+ * Reorder the @array_len elements in @array so that @preferred_value comes
+ * first. Returns >0 to indicate that @preferred_value does occur in @array.
+ * Returns 0 (and leaves @array unchanged) if it does not occur or if @array
+ * is NULL.
+ */
+static u8 dccp_feat_prefer(u8 preferred_value, u8 *array, u8 array_len)
+{
+ u8 i, does_occur = 0;
+
+ if (array != NULL) {
+ for (i = 0; i < array_len; i++)
+ if (array[i] == preferred_value) {
+ /* park the old head here, the head is set below */
+ array[i] = array[0];
+ does_occur++;
+ }
+ if (does_occur)
+ array[0] = preferred_value;
+ }
+ return does_occur;
+}
+
+/**
+ * dccp_feat_reconcile - Reconcile SP preference lists
+ * @fv: SP list to reconcile into
+ * @arr: received SP preference list
+ * @len: length of @arr in bytes
+ * @is_server: whether this side is the server (and @fv is the server's list)
+ * @reorder: whether to reorder the list in @fv after reconciling with @arr
+ *
+ * With @reorder set: when successful, > 0 is returned and the reconciled list
+ * is in @fv. A value of 0 means that negotiation failed (no shared entry).
+ * Without @reorder: @fv is left unchanged and the matching value itself is
+ * returned, or -1 if there is no shared entry.
+ * In both cases 0 is returned if @arr or the vector of @fv is NULL.
+ */
+static int dccp_feat_reconcile(dccp_feat_val *fv, u8 *arr, u8 len,
+ bool is_server, bool reorder)
+{
+ int rc;
+
+ if (!fv->sp.vec || !arr) {
+ DCCP_CRIT("NULL feature value or array");
+ return 0;
+ }
+
+ /* the server's list always takes priority, whichever side we are on */
+ if (is_server)
+ rc = dccp_feat_preflist_match(fv->sp.vec, fv->sp.len, arr, len);
+ else
+ rc = dccp_feat_preflist_match(arr, len, fv->sp.vec, fv->sp.len);
+
+ if (!reorder)
+ return rc;
+ if (rc < 0)
+ return 0;
+
+ /*
+ * Reorder list: used for activating features and in dccp_insert_fn_opt.
+ */
+ return dccp_feat_prefer(rc, fv->sp.vec, fv->sp.len);
+}
+
+/**
+ * dccp_feat_change_recv - Process incoming ChangeL/R options
+ * @fn: feature-negotiation list to update
+ * @is_mandatory: whether the Change was preceded by a Mandatory option
+ * @opt: %DCCPO_CHANGE_L or %DCCPO_CHANGE_R
+ * @feat: one of %dccp_feature_numbers
+ * @val: NN value or SP value/preference list
+ * @len: length of @val in bytes
+ * @server: whether this node is the server (1) or the client (0)
+ *
+ * Returns 0 on success, a Reset code for ending the connection otherwise.
+ */
+static u8 dccp_feat_change_recv(struct list_head *fn, u8 is_mandatory, u8 opt,
+				u8 feat, u8 *val, u8 len, const bool server)
+{
+	u8 defval, type = dccp_feat_type(feat);
+	/* a Change R from the peer refers to our local feature */
+	const bool local = (opt == DCCPO_CHANGE_R);
+	struct dccp_feat_entry *entry;
+	dccp_feat_val fval;
+
+	if (len == 0 || type == FEAT_UNKNOWN)		/* 6.1 and 6.6.8 */
+		goto unknown_feature_or_value;
+
+	dccp_feat_print_opt(opt, feat, val, len, is_mandatory);
+
+	/*
+	 *	Negotiation of NN features: Change R is invalid, so there is no
+	 *	simultaneous negotiation; hence we do not look up in the list.
+	 */
+	if (type == FEAT_NN) {
+		if (local || len > sizeof(fval.nn))
+			goto unknown_feature_or_value;
+
+		/* 6.3.2: "The feature remote MUST accept any valid value..." */
+		fval.nn = dccp_decode_value_var(val, len);
+		if (!dccp_feat_is_valid_nn_val(feat, fval.nn))
+			goto unknown_feature_or_value;
+
+		return dccp_feat_push_confirm(fn, feat, local, &fval);
+	}
+
+	/*
+	 *	Unidirectional/simultaneous negotiation of SP features (6.3.1)
+	 */
+	entry = dccp_feat_list_lookup(fn, feat, local);
+	if (entry == NULL) {
+		/*
+		 * No particular preferences have been registered. We deal with
+		 * this situation by assuming that all valid values are equally
+		 * acceptable, and apply the following checks:
+		 * - if the peer's list is a singleton, we accept a valid value;
+		 * - if we are the server, we first try to see if the peer (the
+		 *   client) advertises the default value. If yes, we use it,
+		 *   otherwise we accept the preferred value;
+		 * - else if we are the client, we use the first list element.
+		 */
+		if (dccp_feat_clone_sp_val(&fval, val, 1))
+			return DCCP_RESET_CODE_TOO_BUSY;
+
+		if (len > 1 && server) {
+			defval = dccp_feat_default_value(feat);
+			if (dccp_feat_preflist_match(&defval, 1, val, len) > -1)
+				fval.sp.vec[0] = defval;
+		} else if (!dccp_feat_is_valid_sp_val(feat, fval.sp.vec[0])) {
+			kfree(fval.sp.vec);
+			goto unknown_feature_or_value;
+		}
+
+		/* Treat unsupported CCIDs like invalid values */
+		if (feat == DCCPF_CCID && !ccid_support_check(fval.sp.vec, 1)) {
+			kfree(fval.sp.vec);
+			goto not_valid_or_not_known;
+		}
+
+		if (dccp_feat_push_confirm(fn, feat, local, &fval)) {
+			/*
+			 * No list entry took ownership of the cloned value,
+			 * so release it here to avoid leaking it.
+			 */
+			kfree(fval.sp.vec);
+			return DCCP_RESET_CODE_TOO_BUSY;
+		}
+		return 0;
+
+	} else if (entry->state == FEAT_UNSTABLE) {	/* 6.6.2 */
+		return 0;
+	}
+
+	if (dccp_feat_reconcile(&entry->val, val, len, server, true)) {
+		entry->empty_confirm = false;
+	} else if (is_mandatory) {
+		return DCCP_RESET_CODE_MANDATORY_ERROR;
+	} else if (entry->state == FEAT_INITIALISING) {
+		/*
+		 * Failed simultaneous negotiation (server only): try to `save'
+		 * the connection by checking whether entry contains the default
+		 * value for @feat. If yes, send an empty Confirm to signal that
+		 * the received Change was not understood - which implies using
+		 * the default value.
+		 * If this also fails, we use Reset as the last resort.
+		 */
+		WARN_ON(!server);
+		defval = dccp_feat_default_value(feat);
+		if (!dccp_feat_reconcile(&entry->val, &defval, 1, server, true))
+			return DCCP_RESET_CODE_OPTION_ERROR;
+		entry->empty_confirm = true;
+	}
+	entry->needs_confirm   = true;
+	entry->needs_mandatory = false;
+	entry->state	       = FEAT_STABLE;
+	return 0;
+
+unknown_feature_or_value:
+	if (!is_mandatory)
+		return dccp_push_empty_confirm(fn, feat, local);
+
+	/* fall through: a Mandatory option that was not understood */
+not_valid_or_not_known:
+	return is_mandatory ? DCCP_RESET_CODE_MANDATORY_ERROR
+			    : DCCP_RESET_CODE_OPTION_ERROR;
+}
+
+/**
+ * dccp_feat_confirm_recv - Process received Confirm options
+ * @fn: feature-negotiation list to update
+ * @is_mandatory: whether @opt was preceded by a Mandatory option
+ * @opt: %DCCPO_CONFIRM_L or %DCCPO_CONFIRM_R
+ * @feat: one of %dccp_feature_numbers
+ * @val: NN value or SP value/preference list
+ * @len: length of @val in bytes
+ * @server: whether this node is server (1) or client (0)
+ *
+ * Returns 0 on success or if the Confirm is ignored, a Reset code otherwise.
+ */
+static u8 dccp_feat_confirm_recv(struct list_head *fn, u8 is_mandatory, u8 opt,
+ u8 feat, u8 *val, u8 len, const bool server)
+{
+ u8 *plist, plen, type = dccp_feat_type(feat);
+ /* a Confirm R from the peer answers a Change of our local feature */
+ const bool local = (opt == DCCPO_CONFIRM_R);
+ struct dccp_feat_entry *entry = dccp_feat_list_lookup(fn, feat, local);
+
+ dccp_feat_print_opt(opt, feat, val, len, is_mandatory);
+
+ if (entry == NULL) { /* nothing queued: ignore or handle error */
+ if (is_mandatory && type == FEAT_UNKNOWN)
+ return DCCP_RESET_CODE_MANDATORY_ERROR;
+
+ if (!local && type == FEAT_NN) /* 6.3.2 */
+ goto confirmation_failed;
+ return 0;
+ }
+
+ if (entry->state != FEAT_CHANGING) /* 6.6.2 */
+ return 0;
+
+ if (len == 0) {
+ if (dccp_feat_must_be_understood(feat)) /* 6.6.7 */
+ goto confirmation_failed;
+ /*
+ * Empty Confirm during connection setup: this means reverting
+ * to the `old' value, which in this case is the default. Since
+ * we handle default values automatically when no other values
+ * have been set, we revert to the old value by removing this
+ * entry from the list.
+ */
+ dccp_feat_list_pop(entry);
+ return 0;
+ }
+
+ if (type == FEAT_NN) {
+ if (len > sizeof(entry->val.nn))
+ goto confirmation_failed;
+
+ /* the peer must echo exactly the value we asked for */
+ if (entry->val.nn == dccp_decode_value_var(val, len))
+ goto confirmation_succeeded;
+
+ DCCP_WARN("Bogus Confirm for non-existing value\n");
+ goto confirmation_failed;
+ }
+
+ /*
+ * Parsing SP Confirms: the first element of @val is the preferred
+ * SP value which the peer confirms, the remainder depends on @len.
+ * Note that only the confirmed value need to be a valid SP value.
+ */
+ if (!dccp_feat_is_valid_sp_val(feat, *val))
+ goto confirmation_failed;
+
+ if (len == 1) { /* peer didn't supply a preference list */
+ plist = val;
+ plen = len;
+ } else { /* preferred value + preference list */
+ plist = val + 1;
+ plen = len - 1;
+ }
+
+ /* Check whether the peer got the reconciliation right (6.6.8) */
+ if (dccp_feat_reconcile(&entry->val, plist, plen, server, 0) != *val) {
+ DCCP_WARN("Confirm selected the wrong value %u\n", *val);
+ return DCCP_RESET_CODE_OPTION_ERROR;
+ }
+ /* store the agreed value at the head of our own list */
+ entry->val.sp.vec[0] = *val;
+
+confirmation_succeeded:
+ entry->state = FEAT_STABLE;
+ return 0;
+
+confirmation_failed:
+ DCCP_WARN("Confirmation failed\n");
+ return is_mandatory ? DCCP_RESET_CODE_MANDATORY_ERROR
+ : DCCP_RESET_CODE_OPTION_ERROR;
+}
+
+/**
+ * dccp_feat_handle_nn_established - Fast-path reception of NN options
+ * @sk: socket of an established DCCP connection
+ * @mandatory: whether @opt was preceded by a Mandatory option
+ * @opt: %DCCPO_CHANGE_L | %DCCPO_CONFIRM_R (NN only)
+ * @feat: NN number, one of %dccp_feature_numbers
+ * @val: NN value
+ * @len: length of @val in bytes
+ *
+ * This function combines the functionality of change_recv/confirm_recv, with
+ * the following differences (reset codes are the same):
+ * - cleanup after receiving the Confirm;
+ * - values are directly activated after successful parsing;
+ * - deliberately restricted to NN features.
+ * The restriction to NN features is essential since SP features can have non-
+ * predictable outcomes (depending on the remote configuration), and are inter-
+ * dependent (CCIDs for instance cause further dependencies).
+ *
+ * Returns 0 on success or if the option is ignored, a Reset code otherwise.
+ */
+static u8 dccp_feat_handle_nn_established(struct sock *sk, u8 mandatory, u8 opt,
+ u8 feat, u8 *val, u8 len)
+{
+ struct list_head *fn = &dccp_sk(sk)->dccps_featneg;
+ /* true only for Confirm R; Change L refers to the peer's feature */
+ const bool local = (opt == DCCPO_CONFIRM_R);
+ struct dccp_feat_entry *entry;
+ u8 type = dccp_feat_type(feat);
+ dccp_feat_val fval;
+
+ dccp_feat_print_opt(opt, feat, val, len, mandatory);
+
+ /* Ignore non-mandatory unknown and non-NN features */
+ if (type == FEAT_UNKNOWN) {
+ if (local && !mandatory)
+ return 0;
+ goto fast_path_unknown;
+ } else if (type != FEAT_NN) {
+ return 0;
+ }
+
+ /*
+ * We don't accept empty Confirms, since in fast-path feature
+ * negotiation the values are enabled immediately after sending
+ * the Change option.
+ * Empty Changes on the other hand are invalid (RFC 4340, 6.1).
+ */
+ if (len == 0 || len > sizeof(fval.nn))
+ goto fast_path_unknown;
+
+ if (opt == DCCPO_CHANGE_L) {
+ fval.nn = dccp_decode_value_var(val, len);
+ if (!dccp_feat_is_valid_nn_val(feat, fval.nn))
+ goto fast_path_unknown;
+
+ /* queue the Confirm and use the new value straight away */
+ if (dccp_feat_push_confirm(fn, feat, local, &fval) ||
+ dccp_feat_activate(sk, feat, local, &fval))
+ return DCCP_RESET_CODE_TOO_BUSY;
+
+ /* set the `Ack Pending' flag to piggyback a Confirm */
+ inet_csk_schedule_ack(sk);
+
+ } else if (opt == DCCPO_CONFIRM_R) {
+ /* only a Change of ours that is still pending can be confirmed */
+ entry = dccp_feat_list_lookup(fn, feat, local);
+ if (entry == NULL || entry->state != FEAT_CHANGING)
+ return 0;
+
+ fval.nn = dccp_decode_value_var(val, len);
+ /*
+ * Just ignore a value that doesn't match our current value.
+ * If the option changes twice within two RTTs, then at least
+ * one CONFIRM will be received for the old value after a
+ * new CHANGE was sent.
+ */
+ if (fval.nn != entry->val.nn)
+ return 0;
+
+ /* Only activate after receiving the Confirm option (6.6.1). */
+ dccp_feat_activate(sk, feat, local, &fval);
+
+ /* It has been confirmed - so remove the entry */
+ dccp_feat_list_pop(entry);
+
+ } else {
+ DCCP_WARN("Received illegal option %u\n", opt);
+ goto fast_path_failed;
+ }
+ return 0;
+
+fast_path_unknown:
+ if (!mandatory)
+ return dccp_push_empty_confirm(fn, feat, local);
+
+fast_path_failed:
+ return mandatory ? DCCP_RESET_CODE_MANDATORY_ERROR
+ : DCCP_RESET_CODE_OPTION_ERROR;
+}
+
+/**
+ * dccp_feat_parse_options - Process Feature-Negotiation Options
+ * @sk: for general use and used by the client during connection setup
+ * @dreq: used by the server during connection setup
+ * @mandatory: whether @opt was preceded by a Mandatory option
+ * @opt: %DCCPO_CHANGE_L | %DCCPO_CHANGE_R | %DCCPO_CONFIRM_L | %DCCPO_CONFIRM_R
+ * @feat: one of %dccp_feature_numbers
+ * @val: value contents of @opt
+ * @len: length of @val in bytes
+ *
+ * The list of @dreq is used when @dreq is non-NULL, the list of @sk otherwise.
+ * Returns 0 on success, a Reset code for ending the connection otherwise.
+ */
+int dccp_feat_parse_options(struct sock *sk, struct dccp_request_sock *dreq,
+ u8 mandatory, u8 opt, u8 feat, u8 *val, u8 len)
+{
+ struct dccp_sock *dp = dccp_sk(sk);
+ struct list_head *fn = dreq ? &dreq->dreq_featneg : &dp->dccps_featneg;
+ bool server = false;
+
+ switch (sk->sk_state) {
+ /*
+ * Negotiation during connection setup
+ */
+ case DCCP_LISTEN:
+ server = true; /* fall through */
+ case DCCP_REQUESTING:
+ switch (opt) {
+ case DCCPO_CHANGE_L:
+ case DCCPO_CHANGE_R:
+ return dccp_feat_change_recv(fn, mandatory, opt, feat,
+ val, len, server);
+ case DCCPO_CONFIRM_R:
+ case DCCPO_CONFIRM_L:
+ return dccp_feat_confirm_recv(fn, mandatory, opt, feat,
+ val, len, server);
+ }
+ /* any other option type is ignored (returns 0 below) */
+ break;
+ /*
+ * Support for exchanging NN options on an established connection.
+ */
+ case DCCP_OPEN:
+ case DCCP_PARTOPEN:
+ return dccp_feat_handle_nn_established(sk, mandatory, opt, feat,
+ val, len);
+ }
+ return 0; /* ignore FN options in all other states */
+}
+
+/**
+ * dccp_feat_init - Seed feature negotiation with host-specific defaults
+ * @sk: socket whose feature-negotiation list is initialised
+ *
+ * This initialises global defaults, depending on the value of the sysctls.
+ * These can later be overridden by registering changes via setsockopt calls.
+ * The last link in the chain is finalise_settings, to make sure that between
+ * here and the start of actual feature negotiation no inconsistencies enter.
+ *
+ * All features not appearing below use either defaults or are otherwise
+ * later adjusted through dccp_feat_finalise_settings().
+ *
+ * Returns 0 on success, -ENOBUFS if the CCID lists could not be obtained, or
+ * the error code of the registration that failed.
+ */
+int dccp_feat_init(struct sock *sk)
+{
+	struct list_head *fn = &dccp_sk(sk)->dccps_featneg;
+	u8 on = 1, off = 0;
+	int rc;
+	struct {
+		u8 *val;
+		u8 len;
+	} tx, rx;
+
+	/* Non-negotiable (NN) features */
+	rc = __feat_register_nn(fn, DCCPF_SEQUENCE_WINDOW, 0,
+				    sysctl_dccp_sequence_window);
+	if (rc)
+		return rc;
+
+	/* Server-priority (SP) features */
+
+	/* Advertise that short seqnos are not supported (7.6.1) */
+	rc = __feat_register_sp(fn, DCCPF_SHORT_SEQNOS, true, true, &off, 1);
+	if (rc)
+		return rc;
+
+	/* RFC 4340 12.1: "If a DCCP is not ECN capable, ..." */
+	rc = __feat_register_sp(fn, DCCPF_ECN_INCAPABLE, true, true, &on, 1);
+	if (rc)
+		return rc;
+
+	/*
+	 * We advertise the available list of CCIDs and reorder according to
+	 * preferences, to avoid failure resulting from negotiating different
+	 * singleton values (which always leads to failure).
+	 * These settings can still (later) be overridden via sockopts.
+	 *
+	 * The two lists are obtained separately, so that the first one can be
+	 * released if obtaining the second one fails (assumes that a failing
+	 * ccid_get_builtin_ccids() leaves nothing for the caller to free).
+	 */
+	if (ccid_get_builtin_ccids(&tx.val, &tx.len))
+		return -ENOBUFS;
+	if (ccid_get_builtin_ccids(&rx.val, &rx.len)) {
+		kfree(tx.val);
+		return -ENOBUFS;
+	}
+
+	/*
+	 * NOTE(review): rc is still 0 here, so a preferred CCID that is not
+	 * built in ends initialisation early but reports success. This is
+	 * the pre-existing behaviour and deliberately left unchanged.
+	 */
+	if (!dccp_feat_prefer(sysctl_dccp_tx_ccid, tx.val, tx.len) ||
+	    !dccp_feat_prefer(sysctl_dccp_rx_ccid, rx.val, rx.len))
+		goto free_ccid_lists;
+
+	rc = __feat_register_sp(fn, DCCPF_CCID, true, false, tx.val, tx.len);
+	if (rc)
+		goto free_ccid_lists;
+
+	rc = __feat_register_sp(fn, DCCPF_CCID, false, false, rx.val, rx.len);
+
+free_ccid_lists:
+	/* registration works on copies, the temporary lists are always freed */
+	kfree(tx.val);
+	kfree(rx.val);
+	return rc;
+}
+
+/*
+ * Activate the negotiated values of @fn_list on @sk. Features without list
+ * entry (or with an empty Confirm) are passed to the activation routine with
+ * a NULL value. On success, entries which do not need a Confirm are removed
+ * from the list and 0 is returned. On failure the CCIDs and the RX Ack Vector
+ * of @sk are released and -1 is returned.
+ */
+int dccp_feat_activate_values(struct sock *sk, struct list_head *fn_list)
+{
+ struct dccp_sock *dp = dccp_sk(sk);
+ struct dccp_feat_entry *cur, *next;
+ int idx;
+ /* per supported feature: value for the remote [0] and the local [1] side */
+ dccp_feat_val *fvals[DCCP_FEAT_SUPPORTED_MAX][2] = {
+ [0 ... DCCP_FEAT_SUPPORTED_MAX-1] = { NULL, NULL }
+ };
+
+ list_for_each_entry(cur, fn_list, node) {
+ /*
+ * An empty Confirm means that either an unknown feature type
+ * or an invalid value was present. In the first case there is
+ * nothing to activate, in the other the default value is used.
+ */
+ if (cur->empty_confirm)
+ continue;
+
+ idx = dccp_feat_index(cur->feat_num);
+ if (idx < 0) {
+ DCCP_BUG("Unknown feature %u", cur->feat_num);
+ goto activation_failed;
+ }
+ if (cur->state != FEAT_STABLE) {
+ DCCP_CRIT("Negotiation of %s %s failed in state %s",
+ cur->is_local ? "local" : "remote",
+ dccp_feat_fname(cur->feat_num),
+ dccp_feat_sname[cur->state]);
+ goto activation_failed;
+ }
+ fvals[idx][cur->is_local] = &cur->val;
+ }
+
+ /*
+ * Activate in decreasing order of index, so that the CCIDs are always
+ * activated as the last feature. This avoids the case where a CCID
+ * relies on the initialisation of one or more features that it depends
+ * on (e.g. Send NDP Count, Send Ack Vector, and Ack Ratio features).
+ */
+ for (idx = DCCP_FEAT_SUPPORTED_MAX; --idx >= 0;)
+ if (__dccp_feat_activate(sk, idx, 0, fvals[idx][0]) ||
+ __dccp_feat_activate(sk, idx, 1, fvals[idx][1])) {
+ DCCP_CRIT("Could not activate %d", idx);
+ goto activation_failed;
+ }
+
+ /* Clean up Change options which have been confirmed already */
+ list_for_each_entry_safe(cur, next, fn_list, node)
+ if (!cur->needs_confirm)
+ dccp_feat_list_pop(cur);
+
+ dccp_pr_debug("Activation OK\n");
+ return 0;
+
+activation_failed:
+ /*
+ * We clean up everything that may have been allocated, since
+ * it is difficult to track at which stage negotiation failed.
+ * This is ok, since all cleanup functions below are expected to be
+ * robust against NULL arguments.
+ */
+ ccid_hc_rx_delete(dp->dccps_hc_rx_ccid, sk);
+ ccid_hc_tx_delete(dp->dccps_hc_tx_ccid, sk);
+ dp->dccps_hc_rx_ccid = dp->dccps_hc_tx_ccid = NULL;
+ dccp_ackvec_free(dp->dccps_hc_rx_ackvec);
+ dp->dccps_hc_rx_ackvec = NULL;
+ return -1;
+}
diff --git a/net/dccp/feat.h b/net/dccp/feat.h
new file mode 100644
index 00000000000..0e75cebb218
--- /dev/null
+++ b/net/dccp/feat.h
@@ -0,0 +1,137 @@
+#ifndef _DCCP_FEAT_H
+#define _DCCP_FEAT_H
+/*
+ * net/dccp/feat.h
+ *
+ * Feature negotiation for the DCCP protocol (RFC 4340, section 6)
+ * Copyright (c) 2008 Gerrit Renker <gerrit@erg.abdn.ac.uk>
+ * Copyright (c) 2005 Andrea Bittau <a.bittau@cs.ucl.ac.uk>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#include <linux/types.h>
+#include "dccp.h"
+
+/*
+ * Known limit values
+ */
+/* Ack Ratio takes 2-byte integer values (11.3) */
+#define DCCPF_ACK_RATIO_MAX 0xFFFF
+/* Wmin=32 and Wmax=2^46-1 from 7.5.2 */
+#define DCCPF_SEQ_WMIN 32
+#define DCCPF_SEQ_WMAX 0x3FFFFFFFFFFFull
+/* Maximum number of SP values that fit in a single (Confirm) option */
+#define DCCP_FEAT_MAX_SP_VALS (DCCP_SINGLE_OPT_MAXLEN - 2)
+
+enum dccp_feat_type {
+ FEAT_AT_RX = 1, /* located at RX side of half-connection */
+ FEAT_AT_TX = 2, /* located at TX side of half-connection */
+ FEAT_SP = 4, /* server-priority reconciliation (6.3.1) */
+ FEAT_NN = 8, /* non-negotiable reconciliation (6.3.2) */
+ FEAT_UNKNOWN = 0xFF /* not understood or invalid feature */
+};
+
+enum dccp_feat_state {
+ FEAT_DEFAULT = 0, /* using default values from 6.4 */
+ FEAT_INITIALISING, /* feature is being initialised */
+ FEAT_CHANGING, /* Change sent but not confirmed yet */
+ FEAT_UNSTABLE, /* local modification in state CHANGING */
+ FEAT_STABLE /* both ends (think they) agree */
+};
+
+/**
+ * dccp_feat_val - Container for SP or NN feature values
+ * @nn: single NN value
+ * @sp.vec: single SP value plus optional preference list
+ * @sp.len: length of @sp.vec in bytes
+ */
+typedef union {
+ u64 nn;
+ struct {
+ u8 *vec;
+ u8 len;
+ } sp;
+} dccp_feat_val;
+
+/**
+ * struct dccp_feat_entry - Data structure to perform feature negotiation
+ * @val: feature's current value (SP features may have preference list)
+ * @state: feature's current state
+ * @feat_num: one of %dccp_feature_numbers
+ * @needs_mandatory: whether Mandatory options should be sent
+ * @needs_confirm: whether to send a Confirm instead of a Change
+ * @empty_confirm: whether to send an empty Confirm (depends on @needs_confirm)
+ * @is_local: feature-local (1) or feature-remote (0)
+ * @node: list pointers, entries arranged in FIFO order
+ */
+struct dccp_feat_entry {
+ dccp_feat_val val;
+ enum dccp_feat_state state:8;
+ u8 feat_num;
+
+ bool needs_mandatory,
+ needs_confirm,
+ empty_confirm,
+ is_local;
+
+ struct list_head node;
+};
+
+static inline u8 dccp_feat_genopt(struct dccp_feat_entry *entry)
+{
+ if (entry->needs_confirm)
+ return entry->is_local ? DCCPO_CONFIRM_L : DCCPO_CONFIRM_R;
+ return entry->is_local ? DCCPO_CHANGE_L : DCCPO_CHANGE_R;
+}
+
+/**
+ * struct ccid_dependency - Track changes resulting from choosing a CCID
+ * @dependent_feat: one of %dccp_feature_numbers
+ * @is_local: local (1) or remote (0) @dependent_feat
+ * @is_mandatory: whether presence of @dependent_feat is mission-critical or not
+ * @val: corresponding default value for @dependent_feat (u8 is sufficient here)
+ */
+struct ccid_dependency {
+ u8 dependent_feat;
+ bool is_local:1,
+ is_mandatory:1;
+ u8 val;
+};
+
+/*
+ * Sysctls to seed defaults for feature negotiation
+ */
+extern unsigned long sysctl_dccp_sequence_window;
+extern int sysctl_dccp_rx_ccid;
+extern int sysctl_dccp_tx_ccid;
+
+int dccp_feat_init(struct sock *sk);
+void dccp_feat_initialise_sysctls(void);
+int dccp_feat_register_sp(struct sock *sk, u8 feat, u8 is_local,
+ u8 const *list, u8 len);
+int dccp_feat_parse_options(struct sock *, struct dccp_request_sock *,
+ u8 mand, u8 opt, u8 feat, u8 *val, u8 len);
+int dccp_feat_clone_list(struct list_head const *, struct list_head *);
+
+/*
+ * Encoding variable-length options and their maximum length.
+ *
+ * This affects NN options (SP options are all u8) and other variable-length
+ * options (see table 3 in RFC 4340). The limit is currently given by the
+ * Sequence Window NN value (sec. 7.5.2) and the NDP count (sec. 7.7) option;
+ * all other options consume less than 6 bytes (timestamps are 4 bytes).
+ * When updating this constant (e.g. due to new internet drafts / RFCs), make
+ * sure that you also update all code which refers to it.
+ */
+#define DCCP_OPTVAL_MAXLEN 6
+
+void dccp_encode_value_var(const u64 value, u8 *to, const u8 len);
+u64 dccp_decode_value_var(const u8 *bf, const u8 len);
+u64 dccp_feat_nn_get(struct sock *sk, u8 feat);
+
+int dccp_insert_option_mandatory(struct sk_buff *skb);
+int dccp_insert_fn_opt(struct sk_buff *skb, u8 type, u8 feat, u8 *val, u8 len,
+ bool repeat_first);
+#endif /* _DCCP_FEAT_H */
diff --git a/net/dccp/input.c b/net/dccp/input.c
index b6cba72b44e..3c8ec7d4a34 100644
--- a/net/dccp/input.c
+++ b/net/dccp/input.c
@@ -1,6 +1,6 @@
/*
* net/dccp/input.c
- *
+ *
* An implementation of the DCCP protocol
* Arnaldo Carvalho de Melo <acme@conectiva.com.br>
*
@@ -10,9 +10,9 @@
* 2 of the License, or (at your option) any later version.
*/
-#include <linux/config.h>
#include <linux/dccp.h>
#include <linux/skbuff.h>
+#include <linux/slab.h>
#include <net/sock.h>
@@ -20,26 +20,80 @@
#include "ccid.h"
#include "dccp.h"
-static void dccp_fin(struct sock *sk, struct sk_buff *skb)
+/* rate-limit for syncs in reply to sequence-invalid packets; RFC 4340, 7.5.4 */
+int sysctl_dccp_sync_ratelimit __read_mostly = HZ / 8;
+
+static void dccp_enqueue_skb(struct sock *sk, struct sk_buff *skb)
{
- sk->sk_shutdown |= RCV_SHUTDOWN;
- sock_set_flag(sk, SOCK_DONE);
__skb_pull(skb, dccp_hdr(skb)->dccph_doff * 4);
__skb_queue_tail(&sk->sk_receive_queue, skb);
skb_set_owner_r(skb, sk);
- sk->sk_data_ready(sk, 0);
+ sk->sk_data_ready(sk);
}
-static void dccp_rcv_close(struct sock *sk, struct sk_buff *skb)
+static void dccp_fin(struct sock *sk, struct sk_buff *skb)
{
- dccp_v4_send_reset(sk, DCCP_RESET_CODE_CLOSED);
- dccp_fin(sk, skb);
- dccp_set_state(sk, DCCP_CLOSED);
- sk_wake_async(sk, 1, POLL_HUP);
+ /*
+ * On receiving Close/CloseReq, both RD/WR shutdown are performed.
+ * RFC 4340, 8.3 says that we MAY send further Data/DataAcks after
+ * receiving the closing segment, but there is no guarantee that such
+ * data will be processed at all.
+ */
+ sk->sk_shutdown = SHUTDOWN_MASK;
+ sock_set_flag(sk, SOCK_DONE);
+ dccp_enqueue_skb(sk, skb);
}
-static void dccp_rcv_closereq(struct sock *sk, struct sk_buff *skb)
+static int dccp_rcv_close(struct sock *sk, struct sk_buff *skb)
{
+ int queued = 0;
+
+ switch (sk->sk_state) {
+ /*
+ * We ignore Close when received in one of the following states:
+ * - CLOSED (may be a late or duplicate packet)
+ * - PASSIVE_CLOSEREQ (the peer has sent a CloseReq earlier)
+ * - RESPOND (already handled by dccp_check_req)
+ */
+ case DCCP_CLOSING:
+ /*
+ * Simultaneous-close: receiving a Close after sending one. This
+ * can happen if both client and server perform active-close and
+ * will result in an endless ping-pong of crossing and retrans-
+ * mitted Close packets, which only terminates when one of the
+ * nodes times out (min. 64 seconds). Quicker convergence can be
+ * achieved when one of the nodes acts as tie-breaker.
+ * This is ok as both ends are done with data transfer and each
+ * end is just waiting for the other to acknowledge termination.
+ */
+ if (dccp_sk(sk)->dccps_role != DCCP_ROLE_CLIENT)
+ break;
+ /* fall through */
+ case DCCP_REQUESTING:
+ case DCCP_ACTIVE_CLOSEREQ:
+ dccp_send_reset(sk, DCCP_RESET_CODE_CLOSED);
+ dccp_done(sk);
+ break;
+ case DCCP_OPEN:
+ case DCCP_PARTOPEN:
+ /* Give waiting application a chance to read pending data */
+ queued = 1;
+ dccp_fin(sk, skb);
+ dccp_set_state(sk, DCCP_PASSIVE_CLOSE);
+ /* fall through */
+ case DCCP_PASSIVE_CLOSE:
+ /*
+ * Retransmitted Close: we have already enqueued the first one.
+ */
+ sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_HUP);
+ }
+ return queued;
+}
+
+static int dccp_rcv_closereq(struct sock *sk, struct sk_buff *skb)
+{
+ int queued = 0;
+
/*
* Step 7: Check for unexpected packet types
* If (S.is_server and P.type == CloseReq)
@@ -48,28 +102,96 @@ static void dccp_rcv_closereq(struct sock *sk, struct sk_buff *skb)
*/
if (dccp_sk(sk)->dccps_role != DCCP_ROLE_CLIENT) {
dccp_send_sync(sk, DCCP_SKB_CB(skb)->dccpd_seq, DCCP_PKT_SYNC);
- return;
+ return queued;
}
- if (sk->sk_state != DCCP_CLOSING)
+ /* Step 13: process relevant Client states < CLOSEREQ */
+ switch (sk->sk_state) {
+ case DCCP_REQUESTING:
+ dccp_send_close(sk, 0);
dccp_set_state(sk, DCCP_CLOSING);
- dccp_send_close(sk, 0);
+ break;
+ case DCCP_OPEN:
+ case DCCP_PARTOPEN:
+ /* Give waiting application a chance to read pending data */
+ queued = 1;
+ dccp_fin(sk, skb);
+ dccp_set_state(sk, DCCP_PASSIVE_CLOSEREQ);
+ /* fall through */
+ case DCCP_PASSIVE_CLOSEREQ:
+ sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_HUP);
+ }
+ return queued;
}
-static inline void dccp_event_ack_recv(struct sock *sk, struct sk_buff *skb)
+static u16 dccp_reset_code_convert(const u8 code)
{
- struct dccp_sock *dp = dccp_sk(sk);
+ const u16 error_code[] = {
+ [DCCP_RESET_CODE_CLOSED] = 0, /* normal termination */
+ [DCCP_RESET_CODE_UNSPECIFIED] = 0, /* nothing known */
+ [DCCP_RESET_CODE_ABORTED] = ECONNRESET,
+
+ [DCCP_RESET_CODE_NO_CONNECTION] = ECONNREFUSED,
+ [DCCP_RESET_CODE_CONNECTION_REFUSED] = ECONNREFUSED,
+ [DCCP_RESET_CODE_TOO_BUSY] = EUSERS,
+ [DCCP_RESET_CODE_AGGRESSION_PENALTY] = EDQUOT,
+
+ [DCCP_RESET_CODE_PACKET_ERROR] = ENOMSG,
+ [DCCP_RESET_CODE_BAD_INIT_COOKIE] = EBADR,
+ [DCCP_RESET_CODE_BAD_SERVICE_CODE] = EBADRQC,
+ [DCCP_RESET_CODE_OPTION_ERROR] = EILSEQ,
+ [DCCP_RESET_CODE_MANDATORY_ERROR] = EOPNOTSUPP,
+ };
+
+ return code >= DCCP_MAX_RESET_CODES ? 0 : error_code[code];
+}
+
+static void dccp_rcv_reset(struct sock *sk, struct sk_buff *skb)
+{
+ u16 err = dccp_reset_code_convert(dccp_hdr_reset(skb)->dccph_reset_code);
- if (dp->dccps_options.dccpo_send_ack_vector)
- dccp_ackvec_check_rcv_ackno(dp->dccps_hc_rx_ackvec, sk,
- DCCP_SKB_CB(skb)->dccpd_ack_seq);
+ sk->sk_err = err;
+
+ /* Queue the equivalent of TCP fin so that dccp_recvmsg exits the loop */
+ dccp_fin(sk, skb);
+
+ if (err && !sock_flag(sk, SOCK_DEAD))
+ sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
+ dccp_time_wait(sk, DCCP_TIME_WAIT, 0);
+}
+
+static void dccp_handle_ackvec_processing(struct sock *sk, struct sk_buff *skb)
+{
+ struct dccp_ackvec *av = dccp_sk(sk)->dccps_hc_rx_ackvec;
+
+ if (av == NULL)
+ return;
+ if (DCCP_SKB_CB(skb)->dccpd_ack_seq != DCCP_PKT_WITHOUT_ACK_SEQ)
+ dccp_ackvec_clear_state(av, DCCP_SKB_CB(skb)->dccpd_ack_seq);
+ dccp_ackvec_input(av, skb);
+}
+
+static void dccp_deliver_input_to_ccids(struct sock *sk, struct sk_buff *skb)
+{
+ const struct dccp_sock *dp = dccp_sk(sk);
+
+ /* Don't deliver to RX CCID when node has shut down read end. */
+ if (!(sk->sk_shutdown & RCV_SHUTDOWN))
+ ccid_hc_rx_packet_recv(dp->dccps_hc_rx_ccid, sk, skb);
+ /*
+ * Until the TX queue has been drained, we can not honour SHUT_WR, since
+ * we need received feedback as input to adjust congestion control.
+ */
+ if (sk->sk_write_queue.qlen > 0 || !(sk->sk_shutdown & SEND_SHUTDOWN))
+ ccid_hc_tx_packet_recv(dp->dccps_hc_tx_ccid, sk, skb);
}
static int dccp_check_seqno(struct sock *sk, struct sk_buff *skb)
{
const struct dccp_hdr *dh = dccp_hdr(skb);
struct dccp_sock *dp = dccp_sk(sk);
- u64 lswl, lawl;
+ u64 lswl, lawl, seqno = DCCP_SKB_CB(skb)->dccpd_seq,
+ ackno = DCCP_SKB_CB(skb)->dccpd_ack_seq;
/*
* Step 5: Prepare sequence numbers for Sync
@@ -83,16 +205,15 @@ static int dccp_check_seqno(struct sock *sk, struct sk_buff *skb)
* Otherwise,
* Drop packet and return
*/
- if (dh->dccph_type == DCCP_PKT_SYNC ||
+ if (dh->dccph_type == DCCP_PKT_SYNC ||
dh->dccph_type == DCCP_PKT_SYNCACK) {
- if (between48(DCCP_SKB_CB(skb)->dccpd_ack_seq,
- dp->dccps_awl, dp->dccps_awh) &&
- !before48(DCCP_SKB_CB(skb)->dccpd_seq, dp->dccps_swl))
- dccp_update_gsr(sk, DCCP_SKB_CB(skb)->dccpd_seq);
+ if (between48(ackno, dp->dccps_awl, dp->dccps_awh) &&
+ dccp_delta_seqno(dp->dccps_swl, seqno) >= 0)
+ dccp_update_gsr(sk, seqno);
else
return -1;
}
-
+
/*
* Step 6: Check sequence numbers
* Let LSWL = S.SWL and LAWL = S.AWL
@@ -103,9 +224,6 @@ static int dccp_check_seqno(struct sock *sk, struct sk_buff *skb)
* Update S.GSR, S.SWL, S.SWH
* If P.type != Sync,
* Update S.GAR
- * Otherwise,
- * Send Sync packet acknowledging P.seqno
- * Drop packet and return
*/
lswl = dp->dccps_swl;
lawl = dp->dccps_awl;
@@ -113,47 +231,61 @@ static int dccp_check_seqno(struct sock *sk, struct sk_buff *skb)
if (dh->dccph_type == DCCP_PKT_CLOSEREQ ||
dh->dccph_type == DCCP_PKT_CLOSE ||
dh->dccph_type == DCCP_PKT_RESET) {
- lswl = dp->dccps_gsr;
- dccp_inc_seqno(&lswl);
+ lswl = ADD48(dp->dccps_gsr, 1);
lawl = dp->dccps_gar;
}
- if (between48(DCCP_SKB_CB(skb)->dccpd_seq, lswl, dp->dccps_swh) &&
- (DCCP_SKB_CB(skb)->dccpd_ack_seq == DCCP_PKT_WITHOUT_ACK_SEQ ||
- between48(DCCP_SKB_CB(skb)->dccpd_ack_seq,
- lawl, dp->dccps_awh))) {
- dccp_update_gsr(sk, DCCP_SKB_CB(skb)->dccpd_seq);
+ if (between48(seqno, lswl, dp->dccps_swh) &&
+ (ackno == DCCP_PKT_WITHOUT_ACK_SEQ ||
+ between48(ackno, lawl, dp->dccps_awh))) {
+ dccp_update_gsr(sk, seqno);
if (dh->dccph_type != DCCP_PKT_SYNC &&
- (DCCP_SKB_CB(skb)->dccpd_ack_seq !=
- DCCP_PKT_WITHOUT_ACK_SEQ))
- dp->dccps_gar = DCCP_SKB_CB(skb)->dccpd_ack_seq;
+ ackno != DCCP_PKT_WITHOUT_ACK_SEQ &&
+ after48(ackno, dp->dccps_gar))
+ dp->dccps_gar = ackno;
} else {
- LIMIT_NETDEBUG(KERN_WARNING "DCCP: Step 6 failed for %s packet, "
- "(LSWL(%llu) <= P.seqno(%llu) <= S.SWH(%llu)) and "
- "(P.ackno %s or LAWL(%llu) <= P.ackno(%llu) <= S.AWH(%llu), "
- "sending SYNC...\n",
- dccp_packet_name(dh->dccph_type),
- (unsigned long long) lswl,
- (unsigned long long)
- DCCP_SKB_CB(skb)->dccpd_seq,
- (unsigned long long) dp->dccps_swh,
- (DCCP_SKB_CB(skb)->dccpd_ack_seq ==
- DCCP_PKT_WITHOUT_ACK_SEQ) ? "doesn't exist" : "exists",
- (unsigned long long) lawl,
- (unsigned long long)
- DCCP_SKB_CB(skb)->dccpd_ack_seq,
- (unsigned long long) dp->dccps_awh);
- dccp_send_sync(sk, DCCP_SKB_CB(skb)->dccpd_seq, DCCP_PKT_SYNC);
+ unsigned long now = jiffies;
+ /*
+ * Step 6: Check sequence numbers
+ * Otherwise,
+ * If P.type == Reset,
+ * Send Sync packet acknowledging S.GSR
+ * Otherwise,
+ * Send Sync packet acknowledging P.seqno
+ * Drop packet and return
+ *
+ * These Syncs are rate-limited as per RFC 4340, 7.5.4:
+ * at most one Sync per sysctl_dccp_sync_ratelimit jiffies.
+ */
+ if (time_before(now, (dp->dccps_rate_last +
+ sysctl_dccp_sync_ratelimit)))
+ return -1;
+
+ DCCP_WARN("Step 6 failed for %s packet, "
+ "(LSWL(%llu) <= P.seqno(%llu) <= S.SWH(%llu)) and "
+ "(P.ackno %s or LAWL(%llu) <= P.ackno(%llu) <= S.AWH(%llu), "
+ "sending SYNC...\n", dccp_packet_name(dh->dccph_type),
+ (unsigned long long) lswl, (unsigned long long) seqno,
+ (unsigned long long) dp->dccps_swh,
+ (ackno == DCCP_PKT_WITHOUT_ACK_SEQ) ? "doesn't exist"
+ : "exists",
+ (unsigned long long) lawl, (unsigned long long) ackno,
+ (unsigned long long) dp->dccps_awh);
+
+ dp->dccps_rate_last = now;
+
+ if (dh->dccph_type == DCCP_PKT_RESET)
+ seqno = dp->dccps_gsr;
+ dccp_send_sync(sk, seqno, DCCP_PKT_SYNC);
return -1;
}
return 0;
}
-static inline int __dccp_rcv_established(struct sock *sk, struct sk_buff *skb,
- const struct dccp_hdr *dh,
- const unsigned len)
+static int __dccp_rcv_established(struct sock *sk, struct sk_buff *skb,
+ const struct dccp_hdr *dh, const unsigned int len)
{
struct dccp_sock *dp = dccp_sk(sk);
@@ -161,13 +293,11 @@ static inline int __dccp_rcv_established(struct sock *sk, struct sk_buff *skb,
case DCCP_PKT_DATAACK:
case DCCP_PKT_DATA:
/*
- * FIXME: check if sk_receive_queue is full, schedule DATA_DROPPED
- * option if it is.
+ * FIXME: schedule DATA_DROPPED (RFC 4340, 11.7.2) if and when
+ * - sk_shutdown == RCV_SHUTDOWN, use Code 1, "Not Listening"
+ * - sk_receive_queue is full, use Code 2, "Receive Buffer"
*/
- __skb_pull(skb, dh->dccph_doff * 4);
- __skb_queue_tail(&sk->sk_receive_queue, skb);
- skb_set_owner_r(skb, sk);
- sk->sk_data_ready(sk, 0);
+ dccp_enqueue_skb(sk, skb);
return 0;
case DCCP_PKT_ACK:
goto discard;
@@ -179,19 +309,20 @@ static inline int __dccp_rcv_established(struct sock *sk, struct sk_buff *skb,
* S.state := TIMEWAIT
* Set TIMEWAIT timer
* Drop packet and return
- */
- dccp_fin(sk, skb);
- dccp_time_wait(sk, DCCP_TIME_WAIT, 0);
+ */
+ dccp_rcv_reset(sk, skb);
return 0;
case DCCP_PKT_CLOSEREQ:
- dccp_rcv_closereq(sk, skb);
+ if (dccp_rcv_closereq(sk, skb))
+ return 0;
goto discard;
case DCCP_PKT_CLOSE:
- dccp_rcv_close(sk, skb);
- return 0;
+ if (dccp_rcv_close(sk, skb))
+ return 0;
+ goto discard;
case DCCP_PKT_REQUEST:
- /* Step 7
- * or (S.is_server and P.type == Response)
+ /* Step 7
+ * or (S.is_server and P.type == Response)
* or (S.is_client and P.type == Request)
* or (S.state >= OPEN and P.type == Request
* and P.seqno >= S.OSR)
@@ -208,7 +339,8 @@ static inline int __dccp_rcv_established(struct sock *sk, struct sk_buff *skb,
if (dp->dccps_role != DCCP_ROLE_CLIENT)
goto send_sync;
check_seq:
- if (!before48(DCCP_SKB_CB(skb)->dccpd_seq, dp->dccps_osr)) {
+ if (dccp_delta_seqno(dp->dccps_osr,
+ DCCP_SKB_CB(skb)->dccpd_seq) >= 0) {
send_sync:
dccp_send_sync(sk, DCCP_SKB_CB(skb)->dccpd_seq,
DCCP_PKT_SYNC);
@@ -218,11 +350,11 @@ send_sync:
dccp_send_sync(sk, DCCP_SKB_CB(skb)->dccpd_seq,
DCCP_PKT_SYNCACK);
/*
- * From the draft:
+ * From RFC 4340, sec. 5.7
*
* As with DCCP-Ack packets, DCCP-Sync and DCCP-SyncAck packets
* MAY have non-zero-length application data areas, whose
- * contents * receivers MUST ignore.
+ * contents receivers MUST ignore.
*/
goto discard;
}
@@ -234,27 +366,16 @@ discard:
}
int dccp_rcv_established(struct sock *sk, struct sk_buff *skb,
- const struct dccp_hdr *dh, const unsigned len)
+ const struct dccp_hdr *dh, const unsigned int len)
{
- struct dccp_sock *dp = dccp_sk(sk);
-
if (dccp_check_seqno(sk, skb))
goto discard;
- if (dccp_parse_options(sk, skb))
- goto discard;
-
- if (DCCP_SKB_CB(skb)->dccpd_ack_seq != DCCP_PKT_WITHOUT_ACK_SEQ)
- dccp_event_ack_recv(sk, skb);
-
- if (dp->dccps_options.dccpo_send_ack_vector &&
- dccp_ackvec_add(dp->dccps_hc_rx_ackvec, sk,
- DCCP_SKB_CB(skb)->dccpd_seq,
- DCCP_ACKVEC_STATE_RECEIVED))
- goto discard;
+ if (dccp_parse_options(sk, NULL, skb))
+ return 1;
- ccid_hc_rx_packet_recv(dp->dccps_hc_rx_ccid, sk, skb);
- ccid_hc_tx_packet_recv(dp->dccps_hc_tx_ccid, sk, skb);
+ dccp_handle_ackvec_processing(sk, skb);
+ dccp_deliver_input_to_ccids(sk, skb);
return __dccp_rcv_established(sk, skb, dh, len);
discard:
@@ -267,9 +388,9 @@ EXPORT_SYMBOL_GPL(dccp_rcv_established);
static int dccp_rcv_request_sent_state_process(struct sock *sk,
struct sk_buff *skb,
const struct dccp_hdr *dh,
- const unsigned len)
+ const unsigned int len)
{
- /*
+ /*
* Step 4: Prepare sequence numbers in REQUEST
* If S.state == REQUEST,
* If (P.type == Response or P.type == Reset)
@@ -283,51 +404,45 @@ static int dccp_rcv_request_sent_state_process(struct sock *sk,
if (dh->dccph_type == DCCP_PKT_RESPONSE) {
const struct inet_connection_sock *icsk = inet_csk(sk);
struct dccp_sock *dp = dccp_sk(sk);
-
- /* Stop the REQUEST timer */
- inet_csk_clear_xmit_timer(sk, ICSK_TIME_RETRANS);
- BUG_TRAP(sk->sk_send_head != NULL);
- __kfree_skb(sk->sk_send_head);
- sk->sk_send_head = NULL;
+ long tstamp = dccp_timestamp();
if (!between48(DCCP_SKB_CB(skb)->dccpd_ack_seq,
dp->dccps_awl, dp->dccps_awh)) {
dccp_pr_debug("invalid ackno: S.AWL=%llu, "
- "P.ackno=%llu, S.AWH=%llu \n",
+ "P.ackno=%llu, S.AWH=%llu\n",
(unsigned long long)dp->dccps_awl,
(unsigned long long)DCCP_SKB_CB(skb)->dccpd_ack_seq,
(unsigned long long)dp->dccps_awh);
goto out_invalid_packet;
}
- if (dp->dccps_options.dccpo_send_ack_vector &&
- dccp_ackvec_add(dp->dccps_hc_rx_ackvec, sk,
- DCCP_SKB_CB(skb)->dccpd_seq,
- DCCP_ACKVEC_STATE_RECEIVED))
- goto out_invalid_packet; /* FIXME: change error code */
+ /*
+ * If option processing (Step 8) failed, return 1 here so that
+ * dccp_v4_do_rcv() sends a Reset. The Reset code depends on
+ * the option type and is set in dccp_parse_options().
+ */
+ if (dccp_parse_options(sk, NULL, skb))
+ return 1;
+
+ /* Obtain usec RTT sample from SYN exchange (used by TFRC). */
+ if (likely(dp->dccps_options_received.dccpor_timestamp_echo))
+ dp->dccps_syn_rtt = dccp_sample_rtt(sk, 10 * (tstamp -
+ dp->dccps_options_received.dccpor_timestamp_echo));
+
+ /* Stop the REQUEST timer */
+ inet_csk_clear_xmit_timer(sk, ICSK_TIME_RETRANS);
+ WARN_ON(sk->sk_send_head == NULL);
+ kfree_skb(sk->sk_send_head);
+ sk->sk_send_head = NULL;
- dp->dccps_isr = DCCP_SKB_CB(skb)->dccpd_seq;
- dccp_update_gsr(sk, dp->dccps_isr);
/*
- * SWL and AWL are initially adjusted so that they are not less than
- * the initial Sequence Numbers received and sent, respectively:
- * SWL := max(GSR + 1 - floor(W/4), ISR),
- * AWL := max(GSS - W' + 1, ISS).
- * These adjustments MUST be applied only at the beginning of the
- * connection.
- *
- * AWL was adjusted in dccp_v4_connect -acme
+ * Set ISR, GSR from packet. ISS was set in dccp_v{4,6}_connect
+ * and GSS in dccp_transmit_skb(). Setting AWL/AWH and SWL/SWH
+ * is done as part of activating the feature values below, since
+ * these settings depend on the local/remote Sequence Window
+ * features, which were undefined or not confirmed until now.
*/
- dccp_set_seqno(&dp->dccps_swl,
- max48(dp->dccps_swl, dp->dccps_isr));
-
- if (ccid_hc_rx_init(dp->dccps_hc_rx_ccid, sk) != 0 ||
- ccid_hc_tx_init(dp->dccps_hc_tx_ccid, sk) != 0) {
- ccid_hc_rx_exit(dp->dccps_hc_rx_ccid, sk);
- ccid_hc_tx_exit(dp->dccps_hc_tx_ccid, sk);
- /* FIXME: send appropriate RESET code */
- goto out_invalid_packet;
- }
+ dp->dccps_gsr = dp->dccps_isr = DCCP_SKB_CB(skb)->dccpd_seq;
dccp_sync_mss(sk, icsk->icsk_pmtu_cookie);
@@ -342,18 +457,27 @@ static int dccp_rcv_request_sent_state_process(struct sock *sk,
* from the Response * /
* S.state := PARTOPEN
* Set PARTOPEN timer
- * Continue with S.state == PARTOPEN
+ * Continue with S.state == PARTOPEN
* / * Step 12 will send the Ack completing the
* three-way handshake * /
*/
dccp_set_state(sk, DCCP_PARTOPEN);
+ /*
+ * If feature negotiation was successful, activate features now;
+ * an activation failure means that this host could not activate
+ * one or more features (e.g. insufficient memory), which would
+ * leave at least one feature in an undefined state.
+ */
+ if (dccp_feat_activate_values(sk, &dp->dccps_featneg))
+ goto unable_to_proceed;
+
/* Make sure socket is routed, for correct metrics. */
icsk->icsk_af_ops->rebuild_header(sk);
if (!sock_flag(sk, SOCK_DEAD)) {
sk->sk_state_change(sk);
- sk_wake_async(sk, 0, POLL_OUT);
+ sk_wake_async(sk, SOCK_WAKE_IO, POLL_OUT);
}
if (sk->sk_write_pending || icsk->icsk_ack.pingpong ||
@@ -373,7 +497,7 @@ static int dccp_rcv_request_sent_state_process(struct sock *sk,
*/
__kfree_skb(skb);
return 0;
- }
+ }
dccp_send_ack(sk);
return -1;
}
@@ -381,14 +505,26 @@ static int dccp_rcv_request_sent_state_process(struct sock *sk,
out_invalid_packet:
/* dccp_v4_do_rcv will send a reset */
DCCP_SKB_CB(skb)->dccpd_reset_code = DCCP_RESET_CODE_PACKET_ERROR;
- return 1;
+ return 1;
+
+unable_to_proceed:
+ DCCP_SKB_CB(skb)->dccpd_reset_code = DCCP_RESET_CODE_ABORTED;
+ /*
+ * We mark this socket as no longer usable, so that the loop in
+ * dccp_sendmsg() terminates and the application gets notified.
+ */
+ dccp_set_state(sk, DCCP_CLOSED);
+ sk->sk_err = ECOMM;
+ return 1;
}
static int dccp_rcv_respond_partopen_state_process(struct sock *sk,
struct sk_buff *skb,
const struct dccp_hdr *dh,
- const unsigned len)
+ const unsigned int len)
{
+ struct dccp_sock *dp = dccp_sk(sk);
+ u32 sample = dp->dccps_options_received.dccpor_timestamp_echo;
int queued = 0;
switch (dh->dccph_type) {
@@ -413,7 +549,14 @@ static int dccp_rcv_respond_partopen_state_process(struct sock *sk,
if (sk->sk_state == DCCP_PARTOPEN)
inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK);
- dccp_sk(sk)->dccps_osr = DCCP_SKB_CB(skb)->dccpd_seq;
+ /* Obtain usec RTT sample from SYN exchange (used by TFRC). */
+ if (likely(sample)) {
+ long delta = dccp_timestamp() - sample;
+
+ dp->dccps_syn_rtt = dccp_sample_rtt(sk, 10 * delta);
+ }
+
+ dp->dccps_osr = DCCP_SKB_CB(skb)->dccpd_seq;
dccp_set_state(sk, DCCP_OPEN);
if (dh->dccph_type == DCCP_PKT_DATAACK ||
@@ -429,7 +572,7 @@ static int dccp_rcv_respond_partopen_state_process(struct sock *sk,
}
int dccp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
- struct dccp_hdr *dh, unsigned len)
+ struct dccp_hdr *dh, unsigned int len)
{
struct dccp_sock *dp = dccp_sk(sk);
struct dccp_skb_cb *dcb = DCCP_SKB_CB(skb);
@@ -438,37 +581,31 @@ int dccp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
/*
* Step 3: Process LISTEN state
- * (Continuing from dccp_v4_do_rcv and dccp_v6_do_rcv)
*
* If S.state == LISTEN,
- * If P.type == Request or P contains a valid Init Cookie
- * option,
- * * Must scan the packet's options to check for an Init
- * Cookie. Only the Init Cookie is processed here,
- * however; other options are processed in Step 8. This
- * scan need only be performed if the endpoint uses Init
- * Cookies *
- * * Generate a new socket and switch to that socket *
- * Set S := new socket for this port pair
- * S.state = RESPOND
- * Choose S.ISS (initial seqno) or set from Init Cookie
- * Set S.ISR, S.GSR, S.SWL, S.SWH from packet or Init Cookie
- * Continue with S.state == RESPOND
- * * A Response packet will be generated in Step 11 *
- * Otherwise,
- * Generate Reset(No Connection) unless P.type == Reset
- * Drop packet and return
- *
- * NOTE: the check for the packet types is done in
- * dccp_rcv_state_process
+ * If P.type == Request or P contains a valid Init Cookie option,
+ * (* Must scan the packet's options to check for Init
+ * Cookies. Only Init Cookies are processed here,
+ * however; other options are processed in Step 8. This
+ * scan need only be performed if the endpoint uses Init
+ * Cookies *)
+ * (* Generate a new socket and switch to that socket *)
+ * Set S := new socket for this port pair
+ * S.state = RESPOND
+ * Choose S.ISS (initial seqno) or set from Init Cookies
+ * Initialize S.GAR := S.ISS
+ * Set S.ISR, S.GSR, S.SWL, S.SWH from packet or Init Cookies
+ * Continue with S.state == RESPOND
+ * (* A Response packet will be generated in Step 11 *)
+ * Otherwise,
+ * Generate Reset(No Connection) unless P.type == Reset
+ * Drop packet and return
*/
if (sk->sk_state == DCCP_LISTEN) {
if (dh->dccph_type == DCCP_PKT_REQUEST) {
if (inet_csk(sk)->icsk_af_ops->conn_request(sk,
skb) < 0)
return 1;
-
- /* FIXME: do congestion control initialization */
goto discard;
}
if (dh->dccph_type == DCCP_PKT_RESET)
@@ -477,31 +614,36 @@ int dccp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
/* Caller (dccp_v4_do_rcv) will send Reset */
dcb->dccpd_reset_code = DCCP_RESET_CODE_NO_CONNECTION;
return 1;
+ } else if (sk->sk_state == DCCP_CLOSED) {
+ dcb->dccpd_reset_code = DCCP_RESET_CODE_NO_CONNECTION;
+ return 1;
}
- if (sk->sk_state != DCCP_REQUESTING) {
- if (dccp_check_seqno(sk, skb))
- goto discard;
-
- /*
- * Step 8: Process options and mark acknowledgeable
- */
- if (dccp_parse_options(sk, skb))
- goto discard;
-
- if (dcb->dccpd_ack_seq != DCCP_PKT_WITHOUT_ACK_SEQ)
- dccp_event_ack_recv(sk, skb);
-
- if (dp->dccps_options.dccpo_send_ack_vector &&
- dccp_ackvec_add(dp->dccps_hc_rx_ackvec, sk,
- DCCP_SKB_CB(skb)->dccpd_seq,
- DCCP_ACKVEC_STATE_RECEIVED))
- goto discard;
+ /* Step 6: Check sequence numbers (omitted in LISTEN/REQUEST state) */
+ if (sk->sk_state != DCCP_REQUESTING && dccp_check_seqno(sk, skb))
+ goto discard;
- ccid_hc_rx_packet_recv(dp->dccps_hc_rx_ccid, sk, skb);
- ccid_hc_tx_packet_recv(dp->dccps_hc_tx_ccid, sk, skb);
+ /*
+ * Step 7: Check for unexpected packet types
+ * If (S.is_server and P.type == Response)
+ * or (S.is_client and P.type == Request)
+ * or (S.state == RESPOND and P.type == Data),
+ * Send Sync packet acknowledging P.seqno
+ * Drop packet and return
+ */
+ if ((dp->dccps_role != DCCP_ROLE_CLIENT &&
+ dh->dccph_type == DCCP_PKT_RESPONSE) ||
+ (dp->dccps_role == DCCP_ROLE_CLIENT &&
+ dh->dccph_type == DCCP_PKT_REQUEST) ||
+ (sk->sk_state == DCCP_RESPOND && dh->dccph_type == DCCP_PKT_DATA)) {
+ dccp_send_sync(sk, dcb->dccpd_seq, DCCP_PKT_SYNC);
+ goto discard;
}
+ /* Step 8: Process options */
+ if (dccp_parse_options(sk, NULL, skb))
+ return 1;
+
/*
* Step 9: Process Reset
* If P.type == Reset,
@@ -509,54 +651,22 @@ int dccp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
* S.state := TIMEWAIT
* Set TIMEWAIT timer
* Drop packet and return
- */
+ */
if (dh->dccph_type == DCCP_PKT_RESET) {
- /*
- * Queue the equivalent of TCP fin so that dccp_recvmsg
- * exits the loop
- */
- dccp_fin(sk, skb);
- dccp_time_wait(sk, DCCP_TIME_WAIT, 0);
+ dccp_rcv_reset(sk, skb);
return 0;
- /*
- * Step 7: Check for unexpected packet types
- * If (S.is_server and P.type == CloseReq)
- * or (S.is_server and P.type == Response)
- * or (S.is_client and P.type == Request)
- * or (S.state == RESPOND and P.type == Data),
- * Send Sync packet acknowledging P.seqno
- * Drop packet and return
- */
- } else if ((dp->dccps_role != DCCP_ROLE_CLIENT &&
- (dh->dccph_type == DCCP_PKT_RESPONSE ||
- dh->dccph_type == DCCP_PKT_CLOSEREQ)) ||
- (dp->dccps_role == DCCP_ROLE_CLIENT &&
- dh->dccph_type == DCCP_PKT_REQUEST) ||
- (sk->sk_state == DCCP_RESPOND &&
- dh->dccph_type == DCCP_PKT_DATA)) {
- dccp_send_sync(sk, dcb->dccpd_seq, DCCP_PKT_SYNC);
- goto discard;
- } else if (dh->dccph_type == DCCP_PKT_CLOSEREQ) {
- dccp_rcv_closereq(sk, skb);
+ } else if (dh->dccph_type == DCCP_PKT_CLOSEREQ) { /* Step 13 */
+ if (dccp_rcv_closereq(sk, skb))
+ return 0;
goto discard;
- } else if (dh->dccph_type == DCCP_PKT_CLOSE) {
- dccp_rcv_close(sk, skb);
- return 0;
- }
-
- if (unlikely(dh->dccph_type == DCCP_PKT_SYNC)) {
- dccp_send_sync(sk, dcb->dccpd_seq, DCCP_PKT_SYNCACK);
+ } else if (dh->dccph_type == DCCP_PKT_CLOSE) { /* Step 14 */
+ if (dccp_rcv_close(sk, skb))
+ return 0;
goto discard;
}
switch (sk->sk_state) {
- case DCCP_CLOSED:
- dcb->dccpd_reset_code = DCCP_RESET_CODE_NO_CONNECTION;
- return 1;
-
case DCCP_REQUESTING:
- /* FIXME: do congestion control initialization */
-
queued = dccp_rcv_request_sent_state_process(sk, skb, dh, len);
if (queued >= 0)
return queued;
@@ -564,8 +674,12 @@ int dccp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
__kfree_skb(skb);
return 0;
- case DCCP_RESPOND:
case DCCP_PARTOPEN:
+ /* Step 8: if using Ack Vectors, mark packet acknowledgeable */
+ dccp_handle_ackvec_processing(sk, skb);
+ dccp_deliver_input_to_ccids(sk, skb);
+ /* fall through */
+ case DCCP_RESPOND:
queued = dccp_rcv_respond_partopen_state_process(sk, skb,
dh, len);
break;
@@ -576,12 +690,15 @@ int dccp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
switch (old_state) {
case DCCP_PARTOPEN:
sk->sk_state_change(sk);
- sk_wake_async(sk, 0, POLL_OUT);
+ sk_wake_async(sk, SOCK_WAKE_IO, POLL_OUT);
break;
}
+ } else if (unlikely(dh->dccph_type == DCCP_PKT_SYNC)) {
+ dccp_send_sync(sk, dcb->dccpd_seq, DCCP_PKT_SYNCACK);
+ goto discard;
}
- if (!queued) {
+ if (!queued) {
discard:
__kfree_skb(skb);
}
@@ -589,3 +706,27 @@ discard:
}
EXPORT_SYMBOL_GPL(dccp_rcv_state_process);
+
+/**
+ * dccp_sample_rtt - Validate and finalise computation of RTT sample
+ * @delta: number of microseconds between packet and acknowledgment
+ *
+ * The routine is kept generic to work in different contexts. It should be
+ * called immediately when the ACK used for the RTT sample arrives.
+ */
+u32 dccp_sample_rtt(struct sock *sk, long delta)
+{
+ /* dccpor_elapsed_time is either zeroed out or set and > 0 */
+ delta -= dccp_sk(sk)->dccps_options_received.dccpor_elapsed_time * 10;
+
+ if (unlikely(delta <= 0)) {
+ DCCP_WARN("unusable RTT sample %ld, using min\n", delta);
+ return DCCP_SANE_RTT_MIN;
+ }
+ if (unlikely(delta > DCCP_SANE_RTT_MAX)) {
+ DCCP_WARN("RTT sample %ld too large, using max\n", delta);
+ return DCCP_SANE_RTT_MAX;
+ }
+
+ return delta;
+}
diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c
index 00f98322667..6ca645c4b48 100644
--- a/net/dccp/ipv4.c
+++ b/net/dccp/ipv4.c
@@ -10,66 +10,49 @@
* 2 of the License, or (at your option) any later version.
*/
-#include <linux/config.h>
#include <linux/dccp.h>
#include <linux/icmp.h>
+#include <linux/slab.h>
#include <linux/module.h>
#include <linux/skbuff.h>
#include <linux/random.h>
#include <net/icmp.h>
+#include <net/inet_common.h>
#include <net/inet_hashtables.h>
#include <net/inet_sock.h>
+#include <net/protocol.h>
#include <net/sock.h>
#include <net/timewait_sock.h>
#include <net/tcp_states.h>
#include <net/xfrm.h>
+#include <net/secure_seq.h>
#include "ackvec.h"
#include "ccid.h"
#include "dccp.h"
+#include "feat.h"
-struct inet_hashinfo __cacheline_aligned dccp_hashinfo = {
- .lhash_lock = RW_LOCK_UNLOCKED,
- .lhash_users = ATOMIC_INIT(0),
- .lhash_wait = __WAIT_QUEUE_HEAD_INITIALIZER(dccp_hashinfo.lhash_wait),
-};
-
-EXPORT_SYMBOL_GPL(dccp_hashinfo);
-
-static int dccp_v4_get_port(struct sock *sk, const unsigned short snum)
-{
- return inet_csk_get_port(&dccp_hashinfo, sk, snum,
- inet_csk_bind_conflict);
-}
-
-static void dccp_v4_hash(struct sock *sk)
-{
- inet_hash(&dccp_hashinfo, sk);
-}
-
-void dccp_unhash(struct sock *sk)
-{
- inet_unhash(&dccp_hashinfo, sk);
-}
-
-EXPORT_SYMBOL_GPL(dccp_unhash);
+/*
+ * The per-net dccp.v4_ctl_sk socket is used for responding to
+ * the Out-of-the-blue (OOTB) packets. A control sock will be created
+ * for this socket at the initialization time.
+ */
int dccp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
+ const struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
struct inet_sock *inet = inet_sk(sk);
struct dccp_sock *dp = dccp_sk(sk);
- const struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
+ __be16 orig_sport, orig_dport;
+ __be32 daddr, nexthop;
+ struct flowi4 *fl4;
struct rtable *rt;
- u32 daddr, nexthop;
- int tmp;
int err;
+ struct ip_options_rcu *inet_opt;
dp->dccps_role = DCCP_ROLE_CLIENT;
- if (dccp_service_not_initialized(sk))
- return -EPROTO;
-
if (addr_len < sizeof(struct sockaddr_in))
return -EINVAL;
@@ -77,37 +60,43 @@ int dccp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
return -EAFNOSUPPORT;
nexthop = daddr = usin->sin_addr.s_addr;
- if (inet->opt != NULL && inet->opt->srr) {
+
+ inet_opt = rcu_dereference_protected(inet->inet_opt,
+ sock_owned_by_user(sk));
+ if (inet_opt != NULL && inet_opt->opt.srr) {
if (daddr == 0)
return -EINVAL;
- nexthop = inet->opt->faddr;
+ nexthop = inet_opt->opt.faddr;
}
- tmp = ip_route_connect(&rt, nexthop, inet->saddr,
- RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
- IPPROTO_DCCP,
- inet->sport, usin->sin_port, sk);
- if (tmp < 0)
- return tmp;
+ orig_sport = inet->inet_sport;
+ orig_dport = usin->sin_port;
+ fl4 = &inet->cork.fl.u.ip4;
+ rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
+ RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
+ IPPROTO_DCCP,
+ orig_sport, orig_dport, sk);
+ if (IS_ERR(rt))
+ return PTR_ERR(rt);
if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
ip_rt_put(rt);
return -ENETUNREACH;
}
- if (inet->opt == NULL || !inet->opt->srr)
- daddr = rt->rt_dst;
+ if (inet_opt == NULL || !inet_opt->opt.srr)
+ daddr = fl4->daddr;
- if (inet->saddr == 0)
- inet->saddr = rt->rt_src;
- inet->rcv_saddr = inet->saddr;
+ if (inet->inet_saddr == 0)
+ inet->inet_saddr = fl4->saddr;
+ inet->inet_rcv_saddr = inet->inet_saddr;
- inet->dport = usin->sin_port;
- inet->daddr = daddr;
+ inet->inet_dport = usin->sin_port;
+ inet->inet_daddr = daddr;
inet_csk(sk)->icsk_ext_hdr_len = 0;
- if (inet->opt != NULL)
- inet_csk(sk)->icsk_ext_hdr_len = inet->opt->optlen;
+ if (inet_opt)
+ inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
/*
* Socket identity is still unknown (sport may be zero).
* However we set state to DCCP_REQUESTING and not releasing socket
@@ -119,21 +108,21 @@ int dccp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
if (err != 0)
goto failure;
- err = ip_route_newports(&rt, inet->sport, inet->dport, sk);
- if (err != 0)
+ rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
+ inet->inet_sport, inet->inet_dport, sk);
+ if (IS_ERR(rt)) {
+ err = PTR_ERR(rt);
+ rt = NULL;
goto failure;
-
+ }
/* OK, now commit destination to socket. */
- sk_setup_caps(sk, &rt->u.dst);
-
- dp->dccps_gar =
- dp->dccps_iss = secure_dccp_sequence_number(inet->saddr,
- inet->daddr,
- inet->sport,
- usin->sin_port);
- dccp_update_gss(sk, dp->dccps_iss);
+ sk_setup_caps(sk, &rt->dst);
- inet->id = dp->dccps_iss ^ jiffies;
+ dp->dccps_iss = secure_dccp_sequence_number(inet->inet_saddr,
+ inet->inet_daddr,
+ inet->inet_sport,
+ inet->inet_dport);
+ inet->inet_id = dp->dccps_iss ^ jiffies;
err = dccp_connect(sk);
rt = NULL;
@@ -148,7 +137,7 @@ failure:
dccp_set_state(sk, DCCP_CLOSED);
ip_rt_put(rt);
sk->sk_route_caps = 0;
- inet->dport = 0;
+ inet->inet_dport = 0;
goto out;
}
@@ -172,17 +161,10 @@ static inline void dccp_do_pmtu_discovery(struct sock *sk,
if (sk->sk_state == DCCP_LISTEN)
return;
- /* We don't check in the destentry if pmtu discovery is forbidden
- * on this route. We just assume that no packet_to_big packets
- * are send back when pmtu discovery is not active.
- * There is a small race when the user changes this flag in the
- * route, but I think that's acceptable.
- */
- if ((dst = __sk_dst_check(sk, 0)) == NULL)
+ dst = inet_csk_update_pmtu(sk, mtu);
+ if (!dst)
return;
- dst->ops->update_pmtu(dst, mtu);
-
/* Something is about to be wrong... Remember soft error
* for the case, if this connection will not able to recover.
*/
@@ -192,11 +174,12 @@ static inline void dccp_do_pmtu_discovery(struct sock *sk,
mtu = dst_mtu(dst);
if (inet->pmtudisc != IP_PMTUDISC_DONT &&
+ ip_sk_accept_pmtu(sk) &&
inet_csk(sk)->icsk_pmtu_cookie > mtu) {
dccp_sync_mss(sk, mtu);
/*
- * From: draft-ietf-dccp-spec-11.txt
+ * From RFC 4340, sec. 14.1:
*
* DCCP-Sync packets are the best choice for upward
* probing, since DCCP-Sync probes do not risk application
@@ -206,86 +189,12 @@ static inline void dccp_do_pmtu_discovery(struct sock *sk,
} /* else let the usual retransmit timer handle it */
}
-static void dccp_v4_ctl_send_ack(struct sk_buff *rxskb)
-{
- int err;
- struct dccp_hdr *rxdh = dccp_hdr(rxskb), *dh;
- const int dccp_hdr_ack_len = sizeof(struct dccp_hdr) +
- sizeof(struct dccp_hdr_ext) +
- sizeof(struct dccp_hdr_ack_bits);
- struct sk_buff *skb;
-
- if (((struct rtable *)rxskb->dst)->rt_type != RTN_LOCAL)
- return;
-
- skb = alloc_skb(MAX_DCCP_HEADER + 15, GFP_ATOMIC);
- if (skb == NULL)
- return;
-
- /* Reserve space for headers. */
- skb_reserve(skb, MAX_DCCP_HEADER);
-
- skb->dst = dst_clone(rxskb->dst);
-
- skb->h.raw = skb_push(skb, dccp_hdr_ack_len);
- dh = dccp_hdr(skb);
- memset(dh, 0, dccp_hdr_ack_len);
-
- /* Build DCCP header and checksum it. */
- dh->dccph_type = DCCP_PKT_ACK;
- dh->dccph_sport = rxdh->dccph_dport;
- dh->dccph_dport = rxdh->dccph_sport;
- dh->dccph_doff = dccp_hdr_ack_len / 4;
- dh->dccph_x = 1;
-
- dccp_hdr_set_seq(dh, DCCP_SKB_CB(rxskb)->dccpd_ack_seq);
- dccp_hdr_set_ack(dccp_hdr_ack_bits(skb),
- DCCP_SKB_CB(rxskb)->dccpd_seq);
-
- bh_lock_sock(dccp_ctl_socket->sk);
- err = ip_build_and_send_pkt(skb, dccp_ctl_socket->sk,
- rxskb->nh.iph->daddr,
- rxskb->nh.iph->saddr, NULL);
- bh_unlock_sock(dccp_ctl_socket->sk);
-
- if (err == NET_XMIT_CN || err == 0) {
- DCCP_INC_STATS_BH(DCCP_MIB_OUTSEGS);
- DCCP_INC_STATS_BH(DCCP_MIB_OUTRSTS);
- }
-}
-
-static void dccp_v4_reqsk_send_ack(struct sk_buff *skb,
- struct request_sock *req)
-{
- dccp_v4_ctl_send_ack(skb);
-}
-
-static int dccp_v4_send_response(struct sock *sk, struct request_sock *req,
- struct dst_entry *dst)
+static void dccp_do_redirect(struct sk_buff *skb, struct sock *sk)
{
- int err = -1;
- struct sk_buff *skb;
-
- /* First, grab a route. */
-
- if (dst == NULL && (dst = inet_csk_route_req(sk, req)) == NULL)
- goto out;
-
- skb = dccp_make_response(sk, dst, req);
- if (skb != NULL) {
- const struct inet_request_sock *ireq = inet_rsk(req);
-
- memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
- err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
- ireq->rmt_addr,
- ireq->opt);
- if (err == NET_XMIT_CN)
- err = 0;
- }
+ struct dst_entry *dst = __sk_dst_check(sk, 0);
-out:
- dst_release(dst);
- return err;
+ if (dst)
+ dst->ops->redirect(dst, sk, skb);
}
/*
@@ -300,33 +209,36 @@ out:
* check at all. A more general error queue to queue errors for later handling
* is probably better.
*/
-void dccp_v4_err(struct sk_buff *skb, u32 info)
+static void dccp_v4_err(struct sk_buff *skb, u32 info)
{
const struct iphdr *iph = (struct iphdr *)skb->data;
- const struct dccp_hdr *dh = (struct dccp_hdr *)(skb->data +
- (iph->ihl << 2));
+ const u8 offset = iph->ihl << 2;
+ const struct dccp_hdr *dh = (struct dccp_hdr *)(skb->data + offset);
struct dccp_sock *dp;
struct inet_sock *inet;
- const int type = skb->h.icmph->type;
- const int code = skb->h.icmph->code;
+ const int type = icmp_hdr(skb)->type;
+ const int code = icmp_hdr(skb)->code;
struct sock *sk;
__u64 seq;
int err;
+ struct net *net = dev_net(skb->dev);
- if (skb->len < (iph->ihl << 2) + 8) {
- ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
+ if (skb->len < offset + sizeof(*dh) ||
+ skb->len < offset + __dccp_basic_hdr_len(dh)) {
+ ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
return;
}
- sk = inet_lookup(&dccp_hashinfo, iph->daddr, dh->dccph_dport,
- iph->saddr, dh->dccph_sport, inet_iif(skb));
+ sk = inet_lookup(net, &dccp_hashinfo,
+ iph->daddr, dh->dccph_dport,
+ iph->saddr, dh->dccph_sport, inet_iif(skb));
if (sk == NULL) {
- ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
+ ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
return;
}
if (sk->sk_state == DCCP_TIME_WAIT) {
- inet_twsk_put((struct inet_timewait_sock *)sk);
+ inet_twsk_put(inet_twsk(sk));
return;
}
@@ -335,20 +247,23 @@ void dccp_v4_err(struct sk_buff *skb, u32 info)
* servers this needs to be solved differently.
*/
if (sock_owned_by_user(sk))
- NET_INC_STATS_BH(LINUX_MIB_LOCKDROPPEDICMPS);
+ NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS);
if (sk->sk_state == DCCP_CLOSED)
goto out;
dp = dccp_sk(sk);
- seq = dccp_hdr_seq(skb);
- if (sk->sk_state != DCCP_LISTEN &&
- !between48(seq, dp->dccps_swl, dp->dccps_swh)) {
- NET_INC_STATS(LINUX_MIB_OUTOFWINDOWICMPS);
+ seq = dccp_hdr_seq(dh);
+ if ((1 << sk->sk_state) & ~(DCCPF_REQUESTING | DCCPF_LISTEN) &&
+ !between48(seq, dp->dccps_awl, dp->dccps_awh)) {
+ NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
goto out;
}
switch (type) {
+ case ICMP_REDIRECT:
+ dccp_do_redirect(skb, sk);
+ goto out;
case ICMP_SOURCE_QUENCH:
/* Just silently ignore these. */
goto out;
@@ -388,10 +303,11 @@ void dccp_v4_err(struct sk_buff *skb, u32 info)
* ICMPs are not backlogged, hence we cannot get an established
* socket here.
*/
- BUG_TRAP(!req->sk);
+ WARN_ON(req->sk);
- if (seq != dccp_rsk(req)->dreq_iss) {
- NET_INC_STATS_BH(LINUX_MIB_OUTOFWINDOWICMPS);
+ if (!between48(seq, dccp_rsk(req)->dreq_iss,
+ dccp_rsk(req)->dreq_gss)) {
+ NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
goto out;
}
/*
@@ -444,136 +360,33 @@ out:
sock_put(sk);
}
-/* This routine computes an IPv4 DCCP checksum. */
-void dccp_v4_send_check(struct sock *sk, int len, struct sk_buff *skb)
+static inline __sum16 dccp_v4_csum_finish(struct sk_buff *skb,
+ __be32 src, __be32 dst)
+{
+ return csum_tcpudp_magic(src, dst, skb->len, IPPROTO_DCCP, skb->csum);
+}
+
+void dccp_v4_send_check(struct sock *sk, struct sk_buff *skb)
{
const struct inet_sock *inet = inet_sk(sk);
struct dccp_hdr *dh = dccp_hdr(skb);
- dh->dccph_checksum = dccp_v4_checksum(skb, inet->saddr, inet->daddr);
+ dccp_csum_outgoing(skb);
+ dh->dccph_checksum = dccp_v4_csum_finish(skb,
+ inet->inet_saddr,
+ inet->inet_daddr);
}
EXPORT_SYMBOL_GPL(dccp_v4_send_check);
-int dccp_v4_send_reset(struct sock *sk, enum dccp_reset_codes code)
+static inline u64 dccp_v4_init_sequence(const struct sk_buff *skb)
{
- struct sk_buff *skb;
- /*
- * FIXME: what if rebuild_header fails?
- * Should we be doing a rebuild_header here?
- */
- int err = inet_sk_rebuild_header(sk);
-
- if (err != 0)
- return err;
-
- skb = dccp_make_reset(sk, sk->sk_dst_cache, code);
- if (skb != NULL) {
- const struct inet_sock *inet = inet_sk(sk);
-
- memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
- err = ip_build_and_send_pkt(skb, sk,
- inet->saddr, inet->daddr, NULL);
- if (err == NET_XMIT_CN)
- err = 0;
- }
-
- return err;
-}
-
-static inline u64 dccp_v4_init_sequence(const struct sock *sk,
- const struct sk_buff *skb)
-{
- return secure_dccp_sequence_number(skb->nh.iph->daddr,
- skb->nh.iph->saddr,
+ return secure_dccp_sequence_number(ip_hdr(skb)->daddr,
+ ip_hdr(skb)->saddr,
dccp_hdr(skb)->dccph_dport,
dccp_hdr(skb)->dccph_sport);
}
-int dccp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
-{
- struct inet_request_sock *ireq;
- struct dccp_sock dp;
- struct request_sock *req;
- struct dccp_request_sock *dreq;
- const __u32 saddr = skb->nh.iph->saddr;
- const __u32 daddr = skb->nh.iph->daddr;
- const __u32 service = dccp_hdr_request(skb)->dccph_req_service;
- struct dccp_skb_cb *dcb = DCCP_SKB_CB(skb);
- __u8 reset_code = DCCP_RESET_CODE_TOO_BUSY;
-
- /* Never answer to DCCP_PKT_REQUESTs send to broadcast or multicast */
- if (((struct rtable *)skb->dst)->rt_flags &
- (RTCF_BROADCAST | RTCF_MULTICAST)) {
- reset_code = DCCP_RESET_CODE_NO_CONNECTION;
- goto drop;
- }
-
- if (dccp_bad_service_code(sk, service)) {
- reset_code = DCCP_RESET_CODE_BAD_SERVICE_CODE;
- goto drop;
- }
- /*
- * TW buckets are converted to open requests without
- * limitations, they conserve resources and peer is
- * evidently real one.
- */
- if (inet_csk_reqsk_queue_is_full(sk))
- goto drop;
-
- /*
- * Accept backlog is full. If we have already queued enough
- * of warm entries in syn queue, drop request. It is better than
- * clogging syn queue with openreqs with exponentially increasing
- * timeout.
- */
- if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1)
- goto drop;
-
- req = reqsk_alloc(sk->sk_prot->rsk_prot);
- if (req == NULL)
- goto drop;
-
- /* FIXME: process options */
-
- dccp_openreq_init(req, &dp, skb);
-
- ireq = inet_rsk(req);
- ireq->loc_addr = daddr;
- ireq->rmt_addr = saddr;
- req->rcv_wnd = 100; /* Fake, option parsing will get the
- right value */
- ireq->opt = NULL;
-
- /*
- * Step 3: Process LISTEN state
- *
- * Set S.ISR, S.GSR, S.SWL, S.SWH from packet or Init Cookie
- *
- * In fact we defer setting S.GSR, S.SWL, S.SWH to
- * dccp_create_openreq_child.
- */
- dreq = dccp_rsk(req);
- dreq->dreq_isr = dcb->dccpd_seq;
- dreq->dreq_iss = dccp_v4_init_sequence(sk, skb);
- dreq->dreq_service = service;
-
- if (dccp_v4_send_response(sk, req, NULL))
- goto drop_and_free;
-
- inet_csk_reqsk_queue_hash_add(sk, req, DCCP_TIMEOUT_INIT);
- return 0;
-
-drop_and_free:
- reqsk_free(req);
-drop:
- DCCP_INC_STATS_BH(DCCP_MIB_ATTEMPTFAILS);
- dcb->dccpd_reset_code = reset_code;
- return -1;
-}
-
-EXPORT_SYMBOL_GPL(dccp_v4_conn_request);
-
/*
* The three way handshake has completed - we got a valid ACK or DATAACK -
* now create the new socket.
@@ -586,46 +399,50 @@ struct sock *dccp_v4_request_recv_sock(struct sock *sk, struct sk_buff *skb,
{
struct inet_request_sock *ireq;
struct inet_sock *newinet;
- struct dccp_sock *newdp;
struct sock *newsk;
if (sk_acceptq_is_full(sk))
goto exit_overflow;
- if (dst == NULL && (dst = inet_csk_route_req(sk, req)) == NULL)
- goto exit;
-
newsk = dccp_create_openreq_child(sk, req, skb);
if (newsk == NULL)
- goto exit;
+ goto exit_nonewsk;
- sk_setup_caps(newsk, dst);
-
- newdp = dccp_sk(newsk);
newinet = inet_sk(newsk);
ireq = inet_rsk(req);
- newinet->daddr = ireq->rmt_addr;
- newinet->rcv_saddr = ireq->loc_addr;
- newinet->saddr = ireq->loc_addr;
- newinet->opt = ireq->opt;
+ newinet->inet_daddr = ireq->ir_rmt_addr;
+ newinet->inet_rcv_saddr = ireq->ir_loc_addr;
+ newinet->inet_saddr = ireq->ir_loc_addr;
+ newinet->inet_opt = ireq->opt;
ireq->opt = NULL;
newinet->mc_index = inet_iif(skb);
- newinet->mc_ttl = skb->nh.iph->ttl;
- newinet->id = jiffies;
+ newinet->mc_ttl = ip_hdr(skb)->ttl;
+ newinet->inet_id = jiffies;
+
+ if (dst == NULL && (dst = inet_csk_route_child_sock(sk, newsk, req)) == NULL)
+ goto put_and_exit;
+
+ sk_setup_caps(newsk, dst);
dccp_sync_mss(newsk, dst_mtu(dst));
- __inet_hash(&dccp_hashinfo, newsk, 0);
- __inet_inherit_port(&dccp_hashinfo, sk, newsk);
+ if (__inet_inherit_port(sk, newsk) < 0)
+ goto put_and_exit;
+ __inet_hash_nolisten(newsk, NULL);
return newsk;
exit_overflow:
- NET_INC_STATS_BH(LINUX_MIB_LISTENOVERFLOWS);
-exit:
- NET_INC_STATS_BH(LINUX_MIB_LISTENDROPS);
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
+exit_nonewsk:
dst_release(dst);
+exit:
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
return NULL;
+put_and_exit:
+ inet_csk_prepare_forced_close(newsk);
+ dccp_done(newsk);
+ goto exit;
}
EXPORT_SYMBOL_GPL(dccp_v4_request_recv_sock);
@@ -633,7 +450,7 @@ EXPORT_SYMBOL_GPL(dccp_v4_request_recv_sock);
static struct sock *dccp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
{
const struct dccp_hdr *dh = dccp_hdr(skb);
- const struct iphdr *iph = skb->nh.iph;
+ const struct iphdr *iph = ip_hdr(skb);
struct sock *nsk;
struct request_sock **prev;
/* Find possible connection requests. */
@@ -643,147 +460,111 @@ static struct sock *dccp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
if (req != NULL)
return dccp_check_req(sk, skb, req, prev);
- nsk = __inet_lookup_established(&dccp_hashinfo,
- iph->saddr, dh->dccph_sport,
- iph->daddr, ntohs(dh->dccph_dport),
- inet_iif(skb));
+ nsk = inet_lookup_established(sock_net(sk), &dccp_hashinfo,
+ iph->saddr, dh->dccph_sport,
+ iph->daddr, dh->dccph_dport,
+ inet_iif(skb));
if (nsk != NULL) {
if (nsk->sk_state != DCCP_TIME_WAIT) {
bh_lock_sock(nsk);
return nsk;
}
- inet_twsk_put((struct inet_timewait_sock *)nsk);
+ inet_twsk_put(inet_twsk(nsk));
return NULL;
}
return sk;
}
-int dccp_v4_checksum(const struct sk_buff *skb, const u32 saddr,
- const u32 daddr)
+static struct dst_entry* dccp_v4_route_skb(struct net *net, struct sock *sk,
+ struct sk_buff *skb)
{
- const struct dccp_hdr* dh = dccp_hdr(skb);
- int checksum_len;
- u32 tmp;
-
- if (dh->dccph_cscov == 0)
- checksum_len = skb->len;
- else {
- checksum_len = (dh->dccph_cscov + dh->dccph_x) * sizeof(u32);
- checksum_len = checksum_len < skb->len ? checksum_len :
- skb->len;
+ struct rtable *rt;
+ const struct iphdr *iph = ip_hdr(skb);
+ struct flowi4 fl4 = {
+ .flowi4_oif = inet_iif(skb),
+ .daddr = iph->saddr,
+ .saddr = iph->daddr,
+ .flowi4_tos = RT_CONN_FLAGS(sk),
+ .flowi4_proto = sk->sk_protocol,
+ .fl4_sport = dccp_hdr(skb)->dccph_dport,
+ .fl4_dport = dccp_hdr(skb)->dccph_sport,
+ };
+
+ security_skb_classify_flow(skb, flowi4_to_flowi(&fl4));
+ rt = ip_route_output_flow(net, &fl4, sk);
+ if (IS_ERR(rt)) {
+ IP_INC_STATS_BH(net, IPSTATS_MIB_OUTNOROUTES);
+ return NULL;
}
- tmp = csum_partial((unsigned char *)dh, checksum_len, 0);
- return csum_tcpudp_magic(saddr, daddr, checksum_len,
- IPPROTO_DCCP, tmp);
+ return &rt->dst;
}
-static int dccp_v4_verify_checksum(struct sk_buff *skb,
- const u32 saddr, const u32 daddr)
+static int dccp_v4_send_response(struct sock *sk, struct request_sock *req)
{
- struct dccp_hdr *dh = dccp_hdr(skb);
- int checksum_len;
- u32 tmp;
-
- if (dh->dccph_cscov == 0)
- checksum_len = skb->len;
- else {
- checksum_len = (dh->dccph_cscov + dh->dccph_x) * sizeof(u32);
- checksum_len = checksum_len < skb->len ? checksum_len :
- skb->len;
- }
- tmp = csum_partial((unsigned char *)dh, checksum_len, 0);
- return csum_tcpudp_magic(saddr, daddr, checksum_len,
- IPPROTO_DCCP, tmp) == 0 ? 0 : -1;
-}
+ int err = -1;
+ struct sk_buff *skb;
+ struct dst_entry *dst;
+ struct flowi4 fl4;
-static struct dst_entry* dccp_v4_route_skb(struct sock *sk,
- struct sk_buff *skb)
-{
- struct rtable *rt;
- struct flowi fl = { .oif = ((struct rtable *)skb->dst)->rt_iif,
- .nl_u = { .ip4_u =
- { .daddr = skb->nh.iph->saddr,
- .saddr = skb->nh.iph->daddr,
- .tos = RT_CONN_FLAGS(sk) } },
- .proto = sk->sk_protocol,
- .uli_u = { .ports =
- { .sport = dccp_hdr(skb)->dccph_dport,
- .dport = dccp_hdr(skb)->dccph_sport }
- }
- };
-
- if (ip_route_output_flow(&rt, &fl, sk, 0)) {
- IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
- return NULL;
+ dst = inet_csk_route_req(sk, &fl4, req);
+ if (dst == NULL)
+ goto out;
+
+ skb = dccp_make_response(sk, dst, req);
+ if (skb != NULL) {
+ const struct inet_request_sock *ireq = inet_rsk(req);
+ struct dccp_hdr *dh = dccp_hdr(skb);
+
+ dh->dccph_checksum = dccp_v4_csum_finish(skb, ireq->ir_loc_addr,
+ ireq->ir_rmt_addr);
+ err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
+ ireq->ir_rmt_addr,
+ ireq->opt);
+ err = net_xmit_eval(err);
}
- return &rt->u.dst;
+out:
+ dst_release(dst);
+ return err;
}
-static void dccp_v4_ctl_send_reset(struct sk_buff *rxskb)
+static void dccp_v4_ctl_send_reset(struct sock *sk, struct sk_buff *rxskb)
{
int err;
- struct dccp_hdr *rxdh = dccp_hdr(rxskb), *dh;
- const int dccp_hdr_reset_len = sizeof(struct dccp_hdr) +
- sizeof(struct dccp_hdr_ext) +
- sizeof(struct dccp_hdr_reset);
+ const struct iphdr *rxiph;
struct sk_buff *skb;
struct dst_entry *dst;
- u64 seqno;
+ struct net *net = dev_net(skb_dst(rxskb)->dev);
+ struct sock *ctl_sk = net->dccp.v4_ctl_sk;
/* Never send a reset in response to a reset. */
- if (rxdh->dccph_type == DCCP_PKT_RESET)
+ if (dccp_hdr(rxskb)->dccph_type == DCCP_PKT_RESET)
return;
- if (((struct rtable *)rxskb->dst)->rt_type != RTN_LOCAL)
+ if (skb_rtable(rxskb)->rt_type != RTN_LOCAL)
return;
- dst = dccp_v4_route_skb(dccp_ctl_socket->sk, rxskb);
+ dst = dccp_v4_route_skb(net, ctl_sk, rxskb);
if (dst == NULL)
return;
- skb = alloc_skb(MAX_DCCP_HEADER + 15, GFP_ATOMIC);
+ skb = dccp_ctl_make_reset(ctl_sk, rxskb);
if (skb == NULL)
goto out;
- /* Reserve space for headers. */
- skb_reserve(skb, MAX_DCCP_HEADER);
- skb->dst = dst_clone(dst);
+ rxiph = ip_hdr(rxskb);
+ dccp_hdr(skb)->dccph_checksum = dccp_v4_csum_finish(skb, rxiph->saddr,
+ rxiph->daddr);
+ skb_dst_set(skb, dst_clone(dst));
- skb->h.raw = skb_push(skb, dccp_hdr_reset_len);
- dh = dccp_hdr(skb);
- memset(dh, 0, dccp_hdr_reset_len);
-
- /* Build DCCP header and checksum it. */
- dh->dccph_type = DCCP_PKT_RESET;
- dh->dccph_sport = rxdh->dccph_dport;
- dh->dccph_dport = rxdh->dccph_sport;
- dh->dccph_doff = dccp_hdr_reset_len / 4;
- dh->dccph_x = 1;
- dccp_hdr_reset(skb)->dccph_reset_code =
- DCCP_SKB_CB(rxskb)->dccpd_reset_code;
-
- /* See "8.3.1. Abnormal Termination" in draft-ietf-dccp-spec-11 */
- seqno = 0;
- if (DCCP_SKB_CB(rxskb)->dccpd_ack_seq != DCCP_PKT_WITHOUT_ACK_SEQ)
- dccp_set_seqno(&seqno, DCCP_SKB_CB(rxskb)->dccpd_ack_seq + 1);
-
- dccp_hdr_set_seq(dh, seqno);
- dccp_hdr_set_ack(dccp_hdr_ack_bits(skb),
- DCCP_SKB_CB(rxskb)->dccpd_seq);
-
- dh->dccph_checksum = dccp_v4_checksum(skb, rxskb->nh.iph->saddr,
- rxskb->nh.iph->daddr);
-
- bh_lock_sock(dccp_ctl_socket->sk);
- err = ip_build_and_send_pkt(skb, dccp_ctl_socket->sk,
- rxskb->nh.iph->daddr,
- rxskb->nh.iph->saddr, NULL);
- bh_unlock_sock(dccp_ctl_socket->sk);
-
- if (err == NET_XMIT_CN || err == 0) {
+ bh_lock_sock(ctl_sk);
+ err = ip_build_and_send_pkt(skb, ctl_sk,
+ rxiph->daddr, rxiph->saddr, NULL);
+ bh_unlock_sock(ctl_sk);
+
+ if (net_xmit_eval(err) == 0) {
DCCP_INC_STATS_BH(DCCP_MIB_OUTSEGS);
DCCP_INC_STATS_BH(DCCP_MIB_OUTRSTS);
}
@@ -791,6 +572,107 @@ out:
dst_release(dst);
}
+static void dccp_v4_reqsk_destructor(struct request_sock *req)
+{
+ dccp_feat_list_purge(&dccp_rsk(req)->dreq_featneg);
+ kfree(inet_rsk(req)->opt);
+}
+
+void dccp_syn_ack_timeout(struct sock *sk, struct request_sock *req)
+{
+}
+EXPORT_SYMBOL(dccp_syn_ack_timeout);
+
+static struct request_sock_ops dccp_request_sock_ops __read_mostly = {
+ .family = PF_INET,
+ .obj_size = sizeof(struct dccp_request_sock),
+ .rtx_syn_ack = dccp_v4_send_response,
+ .send_ack = dccp_reqsk_send_ack,
+ .destructor = dccp_v4_reqsk_destructor,
+ .send_reset = dccp_v4_ctl_send_reset,
+ .syn_ack_timeout = dccp_syn_ack_timeout,
+};
+
+int dccp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
+{
+ struct inet_request_sock *ireq;
+ struct request_sock *req;
+ struct dccp_request_sock *dreq;
+ const __be32 service = dccp_hdr_request(skb)->dccph_req_service;
+ struct dccp_skb_cb *dcb = DCCP_SKB_CB(skb);
+
+ /* Never answer to DCCP_PKT_REQUESTs sent to broadcast or multicast */
+ if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
+ return 0; /* discard, don't send a reset here */
+
+ if (dccp_bad_service_code(sk, service)) {
+ dcb->dccpd_reset_code = DCCP_RESET_CODE_BAD_SERVICE_CODE;
+ goto drop;
+ }
+ /*
+ * TW buckets are converted to open requests without
+ * limitations, they conserve resources and peer is
+ * evidently real one.
+ */
+ dcb->dccpd_reset_code = DCCP_RESET_CODE_TOO_BUSY;
+ if (inet_csk_reqsk_queue_is_full(sk))
+ goto drop;
+
+ /*
+ * Accept backlog is full. If we have already queued enough
+ * of warm entries in syn queue, drop request. It is better than
+ * clogging syn queue with openreqs with exponentially increasing
+ * timeout.
+ */
+ if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1)
+ goto drop;
+
+ req = inet_reqsk_alloc(&dccp_request_sock_ops);
+ if (req == NULL)
+ goto drop;
+
+ if (dccp_reqsk_init(req, dccp_sk(sk), skb))
+ goto drop_and_free;
+
+ dreq = dccp_rsk(req);
+ if (dccp_parse_options(sk, dreq, skb))
+ goto drop_and_free;
+
+ if (security_inet_conn_request(sk, skb, req))
+ goto drop_and_free;
+
+ ireq = inet_rsk(req);
+ ireq->ir_loc_addr = ip_hdr(skb)->daddr;
+ ireq->ir_rmt_addr = ip_hdr(skb)->saddr;
+
+ /*
+ * Step 3: Process LISTEN state
+ *
+ * Set S.ISR, S.GSR, S.SWL, S.SWH from packet or Init Cookie
+ *
+ * Setting S.SWL/S.SWH is deferred to dccp_create_openreq_child().
+ */
+ dreq->dreq_isr = dcb->dccpd_seq;
+ dreq->dreq_gsr = dreq->dreq_isr;
+ dreq->dreq_iss = dccp_v4_init_sequence(skb);
+ dreq->dreq_gss = dreq->dreq_iss;
+ dreq->dreq_service = service;
+
+ if (dccp_v4_send_response(sk, req))
+ goto drop_and_free;
+
+ inet_csk_reqsk_queue_hash_add(sk, req, DCCP_TIMEOUT_INIT);
+ return 0;
+
+drop_and_free:
+ reqsk_free(req);
+drop:
+ DCCP_INC_STATS_BH(DCCP_MIB_ATTEMPTFAILS);
+ return -1;
+}
+
+EXPORT_SYMBOL_GPL(dccp_v4_conn_request);
+
int dccp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
{
struct dccp_hdr *dh = dccp_hdr(skb);
@@ -803,24 +685,23 @@ int dccp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
/*
* Step 3: Process LISTEN state
- * If S.state == LISTEN,
- * If P.type == Request or P contains a valid Init Cookie
- * option,
- * * Must scan the packet's options to check for an Init
- * Cookie. Only the Init Cookie is processed here,
- * however; other options are processed in Step 8. This
- * scan need only be performed if the endpoint uses Init
- * Cookies *
- * * Generate a new socket and switch to that socket *
- * Set S := new socket for this port pair
- * S.state = RESPOND
- * Choose S.ISS (initial seqno) or set from Init Cookie
- * Set S.ISR, S.GSR, S.SWL, S.SWH from packet or Init Cookie
- * Continue with S.state == RESPOND
- * * A Response packet will be generated in Step 11 *
- * Otherwise,
- * Generate Reset(No Connection) unless P.type == Reset
- * Drop packet and return
+ * If P.type == Request or P contains a valid Init Cookie option,
+ * (* Must scan the packet's options to check for Init
+ * Cookies. Only Init Cookies are processed here,
+ * however; other options are processed in Step 8. This
+ * scan need only be performed if the endpoint uses Init
+ * Cookies *)
+ * (* Generate a new socket and switch to that socket *)
+ * Set S := new socket for this port pair
+ * S.state = RESPOND
+ * Choose S.ISS (initial seqno) or set from Init Cookies
+ * Initialize S.GAR := S.ISS
+ * Set S.ISR, S.GSR, S.SWL, S.SWH from packet or Init Cookies
+ * Continue with S.state == RESPOND
+ * (* A Response packet will be generated in Step 11 *)
+ * Otherwise,
+ * Generate Reset(No Connection) unless P.type == Reset
+ * Drop packet and return
*
* NOTE: the check for the packet types is done in
* dccp_rcv_state_process
@@ -843,7 +724,7 @@ int dccp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
return 0;
reset:
- dccp_v4_ctl_send_reset(skb);
+ dccp_v4_ctl_send_reset(sk, skb);
discard:
kfree_skb(skb);
return 0;
@@ -851,95 +732,108 @@ discard:
EXPORT_SYMBOL_GPL(dccp_v4_do_rcv);
+/**
+ * dccp_invalid_packet - check for malformed packets
+ * Implements RFC 4340, 8.5: Step 1: Check header basics
+ * Packets that fail these checks are ignored and do not receive Resets.
+ */
int dccp_invalid_packet(struct sk_buff *skb)
{
const struct dccp_hdr *dh;
+ unsigned int cscov;
if (skb->pkt_type != PACKET_HOST)
return 1;
+ /* If the packet is shorter than 12 bytes, drop packet and return */
if (!pskb_may_pull(skb, sizeof(struct dccp_hdr))) {
- LIMIT_NETDEBUG(KERN_WARNING "DCCP: pskb_may_pull failed\n");
+ DCCP_WARN("pskb_may_pull failed\n");
return 1;
}
dh = dccp_hdr(skb);
- /* If the packet type is not understood, drop packet and return */
+ /* If P.type is not understood, drop packet and return */
if (dh->dccph_type >= DCCP_PKT_INVALID) {
- LIMIT_NETDEBUG(KERN_WARNING "DCCP: invalid packet type\n");
+ DCCP_WARN("invalid packet type\n");
return 1;
}
/*
- * If P.Data Offset is too small for packet type, or too large for
- * packet, drop packet and return
+ * If P.Data Offset is too small for packet type, drop packet and return
*/
if (dh->dccph_doff < dccp_hdr_len(skb) / sizeof(u32)) {
- LIMIT_NETDEBUG(KERN_WARNING "DCCP: P.Data Offset(%u) "
- "too small 1\n",
- dh->dccph_doff);
+ DCCP_WARN("P.Data Offset(%u) too small\n", dh->dccph_doff);
return 1;
}
-
+ /*
+ * If P.Data Offset is too large for packet, drop packet and return
+ */
if (!pskb_may_pull(skb, dh->dccph_doff * sizeof(u32))) {
- LIMIT_NETDEBUG(KERN_WARNING "DCCP: P.Data Offset(%u) "
- "too small 2\n",
- dh->dccph_doff);
+ DCCP_WARN("P.Data Offset(%u) too large\n", dh->dccph_doff);
return 1;
}
- dh = dccp_hdr(skb);
-
/*
* If P.type is not Data, Ack, or DataAck and P.X == 0 (the packet
* has short sequence numbers), drop packet and return
*/
- if (dh->dccph_x == 0 &&
- dh->dccph_type != DCCP_PKT_DATA &&
- dh->dccph_type != DCCP_PKT_ACK &&
- dh->dccph_type != DCCP_PKT_DATAACK) {
- LIMIT_NETDEBUG(KERN_WARNING "DCCP: P.type (%s) not Data, Ack "
- "nor DataAck and P.X == 0\n",
- dccp_packet_name(dh->dccph_type));
+ if ((dh->dccph_type < DCCP_PKT_DATA ||
+ dh->dccph_type > DCCP_PKT_DATAACK) && dh->dccph_x == 0) {
+ DCCP_WARN("P.type (%s) not Data || [Data]Ack, while P.X == 0\n",
+ dccp_packet_name(dh->dccph_type));
return 1;
}
+ /*
+ * If P.CsCov is too large for the packet size, drop packet and return.
+ * This must come _before_ checksumming (not as RFC 4340 suggests).
+ */
+ cscov = dccp_csum_coverage(skb);
+ if (cscov > skb->len) {
+ DCCP_WARN("P.CsCov %u exceeds packet length %d\n",
+ dh->dccph_cscov, skb->len);
+ return 1;
+ }
+
+ /* If header checksum is incorrect, drop packet and return.
+ * (This step is completed in the AF-dependent functions.) */
+ skb->csum = skb_checksum(skb, 0, cscov, 0);
+
return 0;
}
EXPORT_SYMBOL_GPL(dccp_invalid_packet);
/* this is called when real data arrives */
-int dccp_v4_rcv(struct sk_buff *skb)
+static int dccp_v4_rcv(struct sk_buff *skb)
{
const struct dccp_hdr *dh;
+ const struct iphdr *iph;
struct sock *sk;
+ int min_cov;
- /* Step 1: Check header basics: */
+ /* Step 1: Check header basics */
if (dccp_invalid_packet(skb))
goto discard_it;
- /* If the header checksum is incorrect, drop packet and return */
- if (dccp_v4_verify_checksum(skb, skb->nh.iph->saddr,
- skb->nh.iph->daddr) < 0) {
- LIMIT_NETDEBUG(KERN_WARNING "%s: incorrect header checksum\n",
- __FUNCTION__);
+ iph = ip_hdr(skb);
+ /* Step 1: If header checksum is incorrect, drop packet and return */
+ if (dccp_v4_csum_finish(skb, iph->saddr, iph->daddr)) {
+ DCCP_WARN("dropped packet with invalid checksum\n");
goto discard_it;
}
dh = dccp_hdr(skb);
- DCCP_SKB_CB(skb)->dccpd_seq = dccp_hdr_seq(skb);
+ DCCP_SKB_CB(skb)->dccpd_seq = dccp_hdr_seq(dh);
DCCP_SKB_CB(skb)->dccpd_type = dh->dccph_type;
- dccp_pr_debug("%8.8s "
- "src=%u.%u.%u.%u@%-5d "
- "dst=%u.%u.%u.%u@%-5d seq=%llu",
+ dccp_pr_debug("%8.8s src=%pI4@%-5d dst=%pI4@%-5d seq=%llu",
dccp_packet_name(dh->dccph_type),
- NIPQUAD(skb->nh.iph->saddr), ntohs(dh->dccph_sport),
- NIPQUAD(skb->nh.iph->daddr), ntohs(dh->dccph_dport),
+ &iph->saddr, ntohs(dh->dccph_sport),
+ &iph->daddr, ntohs(dh->dccph_dport),
(unsigned long long) DCCP_SKB_CB(skb)->dccpd_seq);
if (dccp_packet_without_ack(skb)) {
@@ -947,23 +841,17 @@ int dccp_v4_rcv(struct sk_buff *skb)
dccp_pr_debug_cat("\n");
} else {
DCCP_SKB_CB(skb)->dccpd_ack_seq = dccp_hdr_ack_seq(skb);
- dccp_pr_debug_cat(", ack=%llu\n",
- (unsigned long long)
+ dccp_pr_debug_cat(", ack=%llu\n", (unsigned long long)
DCCP_SKB_CB(skb)->dccpd_ack_seq);
}
/* Step 2:
- * Look up flow ID in table and get corresponding socket */
- sk = __inet_lookup(&dccp_hashinfo,
- skb->nh.iph->saddr, dh->dccph_sport,
- skb->nh.iph->daddr, ntohs(dh->dccph_dport),
- inet_iif(skb));
-
- /*
+ * Look up flow ID in table and get corresponding socket */
+ sk = __inet_lookup_skb(&dccp_hashinfo, skb,
+ dh->dccph_sport, dh->dccph_dport);
+ /*
* Step 2:
- * If no socket ...
- * Generate Reset(No Connection) unless P.type == Reset
- * Drop packet and return
+ * If no socket ...
*/
if (sk == NULL) {
dccp_pr_debug("failed to look up flow ID in table and "
@@ -971,177 +859,100 @@ int dccp_v4_rcv(struct sk_buff *skb)
goto no_dccp_socket;
}
- /*
+ /*
* Step 2:
- * ... or S.state == TIMEWAIT,
+ * ... or S.state == TIMEWAIT,
* Generate Reset(No Connection) unless P.type == Reset
* Drop packet and return
*/
-
if (sk->sk_state == DCCP_TIME_WAIT) {
- dccp_pr_debug("sk->sk_state == DCCP_TIME_WAIT: "
- "do_time_wait\n");
- goto do_time_wait;
+ dccp_pr_debug("sk->sk_state == DCCP_TIME_WAIT: do_time_wait\n");
+ inet_twsk_put(inet_twsk(sk));
+ goto no_dccp_socket;
+ }
+
+ /*
+ * RFC 4340, sec. 9.2.1: Minimum Checksum Coverage
+ * o if MinCsCov = 0, only packets with CsCov = 0 are accepted
+ * o if MinCsCov > 0, also accept packets with CsCov >= MinCsCov
+ */
+ min_cov = dccp_sk(sk)->dccps_pcrlen;
+ if (dh->dccph_cscov && (min_cov == 0 || dh->dccph_cscov < min_cov)) {
+ dccp_pr_debug("Packet CsCov %d does not satisfy MinCsCov %d\n",
+ dh->dccph_cscov, min_cov);
+ /* FIXME: "Such packets SHOULD be reported using Data Dropped
+ * options (Section 11.7) with Drop Code 0, Protocol
+ * Constraints." */
+ goto discard_and_relse;
}
if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
goto discard_and_relse;
nf_reset(skb);
- return sk_receive_skb(sk, skb);
+ return sk_receive_skb(sk, skb, 1);
no_dccp_socket:
if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
goto discard_it;
/*
* Step 2:
+ * If no socket ...
* Generate Reset(No Connection) unless P.type == Reset
* Drop packet and return
*/
if (dh->dccph_type != DCCP_PKT_RESET) {
DCCP_SKB_CB(skb)->dccpd_reset_code =
DCCP_RESET_CODE_NO_CONNECTION;
- dccp_v4_ctl_send_reset(skb);
+ dccp_v4_ctl_send_reset(sk, skb);
}
discard_it:
- /* Discard frame. */
kfree_skb(skb);
return 0;
discard_and_relse:
sock_put(sk);
goto discard_it;
-
-do_time_wait:
- inet_twsk_put((struct inet_timewait_sock *)sk);
- goto no_dccp_socket;
}
-struct inet_connection_sock_af_ops dccp_ipv4_af_ops = {
- .queue_xmit = ip_queue_xmit,
- .send_check = dccp_v4_send_check,
- .rebuild_header = inet_sk_rebuild_header,
- .conn_request = dccp_v4_conn_request,
- .syn_recv_sock = dccp_v4_request_recv_sock,
- .net_header_len = sizeof(struct iphdr),
- .setsockopt = ip_setsockopt,
- .getsockopt = ip_getsockopt,
- .addr2sockaddr = inet_csk_addr2sockaddr,
- .sockaddr_len = sizeof(struct sockaddr_in),
+static const struct inet_connection_sock_af_ops dccp_ipv4_af_ops = {
+ .queue_xmit = ip_queue_xmit,
+ .send_check = dccp_v4_send_check,
+ .rebuild_header = inet_sk_rebuild_header,
+ .conn_request = dccp_v4_conn_request,
+ .syn_recv_sock = dccp_v4_request_recv_sock,
+ .net_header_len = sizeof(struct iphdr),
+ .setsockopt = ip_setsockopt,
+ .getsockopt = ip_getsockopt,
+ .addr2sockaddr = inet_csk_addr2sockaddr,
+ .sockaddr_len = sizeof(struct sockaddr_in),
+ .bind_conflict = inet_csk_bind_conflict,
+#ifdef CONFIG_COMPAT
+ .compat_setsockopt = compat_ip_setsockopt,
+ .compat_getsockopt = compat_ip_getsockopt,
+#endif
};
-int dccp_v4_init_sock(struct sock *sk)
+static int dccp_v4_init_sock(struct sock *sk)
{
- struct dccp_sock *dp = dccp_sk(sk);
- struct inet_connection_sock *icsk = inet_csk(sk);
- static int dccp_ctl_socket_init = 1;
-
- dccp_options_init(&dp->dccps_options);
- do_gettimeofday(&dp->dccps_epoch);
+ static __u8 dccp_v4_ctl_sock_initialized;
+ int err = dccp_init_sock(sk, dccp_v4_ctl_sock_initialized);
- if (dp->dccps_options.dccpo_send_ack_vector) {
- dp->dccps_hc_rx_ackvec = dccp_ackvec_alloc(DCCP_MAX_ACKVEC_LEN,
- GFP_KERNEL);
- if (dp->dccps_hc_rx_ackvec == NULL)
- return -ENOMEM;
+ if (err == 0) {
+ if (unlikely(!dccp_v4_ctl_sock_initialized))
+ dccp_v4_ctl_sock_initialized = 1;
+ inet_csk(sk)->icsk_af_ops = &dccp_ipv4_af_ops;
}
- /*
- * FIXME: We're hardcoding the CCID, and doing this at this point makes
- * the listening (master) sock get CCID control blocks, which is not
- * necessary, but for now, to not mess with the test userspace apps,
- * lets leave it here, later the real solution is to do this in a
- * setsockopt(CCIDs-I-want/accept). -acme
- */
- if (likely(!dccp_ctl_socket_init)) {
- dp->dccps_hc_rx_ccid = ccid_init(dp->dccps_options.dccpo_rx_ccid,
- sk);
- dp->dccps_hc_tx_ccid = ccid_init(dp->dccps_options.dccpo_tx_ccid,
- sk);
- if (dp->dccps_hc_rx_ccid == NULL ||
- dp->dccps_hc_tx_ccid == NULL) {
- ccid_exit(dp->dccps_hc_rx_ccid, sk);
- ccid_exit(dp->dccps_hc_tx_ccid, sk);
- if (dp->dccps_options.dccpo_send_ack_vector) {
- dccp_ackvec_free(dp->dccps_hc_rx_ackvec);
- dp->dccps_hc_rx_ackvec = NULL;
- }
- dp->dccps_hc_rx_ccid = dp->dccps_hc_tx_ccid = NULL;
- return -ENOMEM;
- }
- } else
- dccp_ctl_socket_init = 0;
-
- dccp_init_xmit_timers(sk);
- icsk->icsk_rto = DCCP_TIMEOUT_INIT;
- sk->sk_state = DCCP_CLOSED;
- sk->sk_write_space = dccp_write_space;
- icsk->icsk_af_ops = &dccp_ipv4_af_ops;
- icsk->icsk_sync_mss = dccp_sync_mss;
- dp->dccps_mss_cache = 536;
- dp->dccps_role = DCCP_ROLE_UNDEFINED;
- dp->dccps_service = DCCP_SERVICE_INVALID_VALUE;
-
- return 0;
-}
-
-EXPORT_SYMBOL_GPL(dccp_v4_init_sock);
-
-int dccp_v4_destroy_sock(struct sock *sk)
-{
- struct dccp_sock *dp = dccp_sk(sk);
-
- /*
- * DCCP doesn't use sk_write_queue, just sk_send_head
- * for retransmissions
- */
- if (sk->sk_send_head != NULL) {
- kfree_skb(sk->sk_send_head);
- sk->sk_send_head = NULL;
- }
-
- /* Clean up a referenced DCCP bind bucket. */
- if (inet_csk(sk)->icsk_bind_hash != NULL)
- inet_put_port(&dccp_hashinfo, sk);
-
- kfree(dp->dccps_service_list);
- dp->dccps_service_list = NULL;
-
- ccid_hc_rx_exit(dp->dccps_hc_rx_ccid, sk);
- ccid_hc_tx_exit(dp->dccps_hc_tx_ccid, sk);
- if (dp->dccps_options.dccpo_send_ack_vector) {
- dccp_ackvec_free(dp->dccps_hc_rx_ackvec);
- dp->dccps_hc_rx_ackvec = NULL;
- }
- ccid_exit(dp->dccps_hc_rx_ccid, sk);
- ccid_exit(dp->dccps_hc_tx_ccid, sk);
- dp->dccps_hc_rx_ccid = dp->dccps_hc_tx_ccid = NULL;
-
- return 0;
-}
-
-EXPORT_SYMBOL_GPL(dccp_v4_destroy_sock);
-
-static void dccp_v4_reqsk_destructor(struct request_sock *req)
-{
- kfree(inet_rsk(req)->opt);
+ return err;
}
-static struct request_sock_ops dccp_request_sock_ops = {
- .family = PF_INET,
- .obj_size = sizeof(struct dccp_request_sock),
- .rtx_syn_ack = dccp_v4_send_response,
- .send_ack = dccp_v4_reqsk_send_ack,
- .destructor = dccp_v4_reqsk_destructor,
- .send_reset = dccp_v4_ctl_send_reset,
-};
-
static struct timewait_sock_ops dccp_timewait_sock_ops = {
.twsk_obj_size = sizeof(struct inet_timewait_sock),
};
-struct proto dccp_prot = {
+static struct proto dccp_v4_prot = {
.name = "DCCP",
.owner = THIS_MODULE,
.close = dccp_close,
@@ -1154,17 +965,131 @@ struct proto dccp_prot = {
.sendmsg = dccp_sendmsg,
.recvmsg = dccp_recvmsg,
.backlog_rcv = dccp_v4_do_rcv,
- .hash = dccp_v4_hash,
- .unhash = dccp_unhash,
+ .hash = inet_hash,
+ .unhash = inet_unhash,
.accept = inet_csk_accept,
- .get_port = dccp_v4_get_port,
+ .get_port = inet_csk_get_port,
.shutdown = dccp_shutdown,
- .destroy = dccp_v4_destroy_sock,
+ .destroy = dccp_destroy_sock,
.orphan_count = &dccp_orphan_count,
.max_header = MAX_DCCP_HEADER,
.obj_size = sizeof(struct dccp_sock),
+ .slab_flags = SLAB_DESTROY_BY_RCU,
.rsk_prot = &dccp_request_sock_ops,
.twsk_prot = &dccp_timewait_sock_ops,
+ .h.hashinfo = &dccp_hashinfo,
+#ifdef CONFIG_COMPAT
+ .compat_setsockopt = compat_dccp_setsockopt,
+ .compat_getsockopt = compat_dccp_getsockopt,
+#endif
+};
+
+static const struct net_protocol dccp_v4_protocol = {
+ .handler = dccp_v4_rcv,
+ .err_handler = dccp_v4_err,
+ .no_policy = 1,
+ .netns_ok = 1,
+ .icmp_strict_tag_validation = 1,
+};
+
+static const struct proto_ops inet_dccp_ops = {
+ .family = PF_INET,
+ .owner = THIS_MODULE,
+ .release = inet_release,
+ .bind = inet_bind,
+ .connect = inet_stream_connect,
+ .socketpair = sock_no_socketpair,
+ .accept = inet_accept,
+ .getname = inet_getname,
+ /* FIXME: work on tcp_poll to rename it to inet_csk_poll */
+ .poll = dccp_poll,
+ .ioctl = inet_ioctl,
+ /* FIXME: work on inet_listen to rename it to sock_common_listen */
+ .listen = inet_dccp_listen,
+ .shutdown = inet_shutdown,
+ .setsockopt = sock_common_setsockopt,
+ .getsockopt = sock_common_getsockopt,
+ .sendmsg = inet_sendmsg,
+ .recvmsg = sock_common_recvmsg,
+ .mmap = sock_no_mmap,
+ .sendpage = sock_no_sendpage,
+#ifdef CONFIG_COMPAT
+ .compat_setsockopt = compat_sock_common_setsockopt,
+ .compat_getsockopt = compat_sock_common_getsockopt,
+#endif
+};
+
+static struct inet_protosw dccp_v4_protosw = {
+ .type = SOCK_DCCP,
+ .protocol = IPPROTO_DCCP,
+ .prot = &dccp_v4_prot,
+ .ops = &inet_dccp_ops,
+ .flags = INET_PROTOSW_ICSK,
+};
+
+static int __net_init dccp_v4_init_net(struct net *net)
+{
+ if (dccp_hashinfo.bhash == NULL)
+ return -ESOCKTNOSUPPORT;
+
+ return inet_ctl_sock_create(&net->dccp.v4_ctl_sk, PF_INET,
+ SOCK_DCCP, IPPROTO_DCCP, net);
+}
+
+static void __net_exit dccp_v4_exit_net(struct net *net)
+{
+ inet_ctl_sock_destroy(net->dccp.v4_ctl_sk);
+}
+
+static struct pernet_operations dccp_v4_ops = {
+ .init = dccp_v4_init_net,
+ .exit = dccp_v4_exit_net,
};
-EXPORT_SYMBOL_GPL(dccp_prot);
+static int __init dccp_v4_init(void)
+{
+ int err = proto_register(&dccp_v4_prot, 1);
+
+ if (err != 0)
+ goto out;
+
+ err = inet_add_protocol(&dccp_v4_protocol, IPPROTO_DCCP);
+ if (err != 0)
+ goto out_proto_unregister;
+
+ inet_register_protosw(&dccp_v4_protosw);
+
+ err = register_pernet_subsys(&dccp_v4_ops);
+ if (err)
+ goto out_destroy_ctl_sock;
+out:
+ return err;
+out_destroy_ctl_sock:
+ inet_unregister_protosw(&dccp_v4_protosw);
+ inet_del_protocol(&dccp_v4_protocol, IPPROTO_DCCP);
+out_proto_unregister:
+ proto_unregister(&dccp_v4_prot);
+ goto out;
+}
+
+static void __exit dccp_v4_exit(void)
+{
+ unregister_pernet_subsys(&dccp_v4_ops);
+ inet_unregister_protosw(&dccp_v4_protosw);
+ inet_del_protocol(&dccp_v4_protocol, IPPROTO_DCCP);
+ proto_unregister(&dccp_v4_prot);
+}
+
+module_init(dccp_v4_init);
+module_exit(dccp_v4_exit);
+
+/*
+ * __stringify doesn't like enums, so use SOCK_DCCP (6) and IPPROTO_DCCP (33)
+ * values directly. Also cover the case where the protocol is not specified,
+ * i.e. net-pf-PF_INET-proto-0-type-SOCK_DCCP
+ */
+MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_INET, 33, 6);
+MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_INET, 0, 6);
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Arnaldo Carvalho de Melo <acme@mandriva.com>");
+MODULE_DESCRIPTION("DCCP - Datagram Congestion Controlled Protocol");
diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c
index df074259f9c..4db3c2a1679 100644
--- a/net/dccp/ipv6.c
+++ b/net/dccp/ipv6.c
@@ -1,6 +1,6 @@
/*
* DCCP over IPv6
- * Linux INET6 implementation
+ * Linux INET6 implementation
*
* Based on net/dccp6/ipv6.c
*
@@ -12,9 +12,9 @@
* 2 of the License, or (at your option) any later version.
*/
-#include <linux/config.h>
#include <linux/module.h>
#include <linux/random.h>
+#include <linux/slab.h>
#include <linux/xfrm.h>
#include <net/addrconf.h>
@@ -29,322 +29,137 @@
#include <net/transp_v6.h>
#include <net/ip6_checksum.h>
#include <net/xfrm.h>
+#include <net/secure_seq.h>
#include "dccp.h"
#include "ipv6.h"
+#include "feat.h"
-static void dccp_v6_ctl_send_reset(struct sk_buff *skb);
-static void dccp_v6_reqsk_send_ack(struct sk_buff *skb,
- struct request_sock *req);
-static void dccp_v6_send_check(struct sock *sk, int len, struct sk_buff *skb);
+/* The per-net dccp.v6_ctl_sk is used for sending RSTs and ACKs */
-static int dccp_v6_do_rcv(struct sock *sk, struct sk_buff *skb);
-
-static struct inet_connection_sock_af_ops dccp_ipv6_mapped;
-static struct inet_connection_sock_af_ops dccp_ipv6_af_ops;
-
-static int dccp_v6_get_port(struct sock *sk, unsigned short snum)
-{
- return inet_csk_get_port(&dccp_hashinfo, sk, snum,
- inet6_csk_bind_conflict);
-}
+static const struct inet_connection_sock_af_ops dccp_ipv6_mapped;
+static const struct inet_connection_sock_af_ops dccp_ipv6_af_ops;
static void dccp_v6_hash(struct sock *sk)
{
if (sk->sk_state != DCCP_CLOSED) {
if (inet_csk(sk)->icsk_af_ops == &dccp_ipv6_mapped) {
- dccp_prot.hash(sk);
+ inet_hash(sk);
return;
}
local_bh_disable();
- __inet6_hash(&dccp_hashinfo, sk);
+ __inet6_hash(sk, NULL);
local_bh_enable();
}
}
-static inline u16 dccp_v6_check(struct dccp_hdr *dh, int len,
- struct in6_addr *saddr,
- struct in6_addr *daddr,
- unsigned long base)
+/* add pseudo-header to DCCP checksum stored in skb->csum */
+static inline __sum16 dccp_v6_csum_finish(struct sk_buff *skb,
+ const struct in6_addr *saddr,
+ const struct in6_addr *daddr)
{
- return csum_ipv6_magic(saddr, daddr, len, IPPROTO_DCCP, base);
+ return csum_ipv6_magic(saddr, daddr, skb->len, IPPROTO_DCCP, skb->csum);
}
-static __u32 dccp_v6_init_sequence(struct sock *sk, struct sk_buff *skb)
+static inline void dccp_v6_send_check(struct sock *sk, struct sk_buff *skb)
{
- const struct dccp_hdr *dh = dccp_hdr(skb);
+ struct ipv6_pinfo *np = inet6_sk(sk);
+ struct dccp_hdr *dh = dccp_hdr(skb);
- if (skb->protocol == htons(ETH_P_IPV6))
- return secure_tcpv6_sequence_number(skb->nh.ipv6h->daddr.s6_addr32,
- skb->nh.ipv6h->saddr.s6_addr32,
- dh->dccph_dport,
- dh->dccph_sport);
- else
- return secure_dccp_sequence_number(skb->nh.iph->daddr,
- skb->nh.iph->saddr,
- dh->dccph_dport,
- dh->dccph_sport);
+ dccp_csum_outgoing(skb);
+ dh->dccph_checksum = dccp_v6_csum_finish(skb, &np->saddr, &sk->sk_v6_daddr);
}
-static int dccp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
- int addr_len)
+static inline __u64 dccp_v6_init_sequence(struct sk_buff *skb)
{
- struct sockaddr_in6 *usin = (struct sockaddr_in6 *) uaddr;
- struct inet_connection_sock *icsk = inet_csk(sk);
- struct inet_sock *inet = inet_sk(sk);
- struct ipv6_pinfo *np = inet6_sk(sk);
- struct dccp_sock *dp = dccp_sk(sk);
- struct in6_addr *saddr = NULL, *final_p = NULL, final;
- struct flowi fl;
- struct dst_entry *dst;
- int addr_type;
- int err;
-
- dp->dccps_role = DCCP_ROLE_CLIENT;
-
- if (addr_len < SIN6_LEN_RFC2133)
- return -EINVAL;
-
- if (usin->sin6_family != AF_INET6)
- return -EAFNOSUPPORT;
-
- memset(&fl, 0, sizeof(fl));
-
- if (np->sndflow) {
- fl.fl6_flowlabel = usin->sin6_flowinfo & IPV6_FLOWINFO_MASK;
- IP6_ECN_flow_init(fl.fl6_flowlabel);
- if (fl.fl6_flowlabel & IPV6_FLOWLABEL_MASK) {
- struct ip6_flowlabel *flowlabel;
- flowlabel = fl6_sock_lookup(sk, fl.fl6_flowlabel);
- if (flowlabel == NULL)
- return -EINVAL;
- ipv6_addr_copy(&usin->sin6_addr, &flowlabel->dst);
- fl6_sock_release(flowlabel);
- }
- }
-
- /*
- * connect() to INADDR_ANY means loopback (BSD'ism).
- */
-
- if (ipv6_addr_any(&usin->sin6_addr))
- usin->sin6_addr.s6_addr[15] = 0x1;
-
- addr_type = ipv6_addr_type(&usin->sin6_addr);
-
- if(addr_type & IPV6_ADDR_MULTICAST)
- return -ENETUNREACH;
-
- if (addr_type & IPV6_ADDR_LINKLOCAL) {
- if (addr_len >= sizeof(struct sockaddr_in6) &&
- usin->sin6_scope_id) {
- /* If interface is set while binding, indices
- * must coincide.
- */
- if (sk->sk_bound_dev_if &&
- sk->sk_bound_dev_if != usin->sin6_scope_id)
- return -EINVAL;
-
- sk->sk_bound_dev_if = usin->sin6_scope_id;
- }
-
- /* Connect to link-local address requires an interface */
- if (!sk->sk_bound_dev_if)
- return -EINVAL;
- }
-
- ipv6_addr_copy(&np->daddr, &usin->sin6_addr);
- np->flow_label = fl.fl6_flowlabel;
-
- /*
- * DCCP over IPv4
- */
-
- if (addr_type == IPV6_ADDR_MAPPED) {
- u32 exthdrlen = icsk->icsk_ext_hdr_len;
- struct sockaddr_in sin;
-
- SOCK_DEBUG(sk, "connect: ipv4 mapped\n");
-
- if (__ipv6_only_sock(sk))
- return -ENETUNREACH;
-
- sin.sin_family = AF_INET;
- sin.sin_port = usin->sin6_port;
- sin.sin_addr.s_addr = usin->sin6_addr.s6_addr32[3];
-
- icsk->icsk_af_ops = &dccp_ipv6_mapped;
- sk->sk_backlog_rcv = dccp_v4_do_rcv;
-
- err = dccp_v4_connect(sk, (struct sockaddr *)&sin, sizeof(sin));
-
- if (err) {
- icsk->icsk_ext_hdr_len = exthdrlen;
- icsk->icsk_af_ops = &dccp_ipv6_af_ops;
- sk->sk_backlog_rcv = dccp_v6_do_rcv;
- goto failure;
- } else {
- ipv6_addr_set(&np->saddr, 0, 0, htonl(0x0000FFFF),
- inet->saddr);
- ipv6_addr_set(&np->rcv_saddr, 0, 0, htonl(0x0000FFFF),
- inet->rcv_saddr);
- }
-
- return err;
- }
-
- if (!ipv6_addr_any(&np->rcv_saddr))
- saddr = &np->rcv_saddr;
-
- fl.proto = IPPROTO_DCCP;
- ipv6_addr_copy(&fl.fl6_dst, &np->daddr);
- ipv6_addr_copy(&fl.fl6_src, saddr ? saddr : &np->saddr);
- fl.oif = sk->sk_bound_dev_if;
- fl.fl_ip_dport = usin->sin6_port;
- fl.fl_ip_sport = inet->sport;
-
- if (np->opt && np->opt->srcrt) {
- struct rt0_hdr *rt0 = (struct rt0_hdr *)np->opt->srcrt;
- ipv6_addr_copy(&final, &fl.fl6_dst);
- ipv6_addr_copy(&fl.fl6_dst, rt0->addr);
- final_p = &final;
- }
-
- err = ip6_dst_lookup(sk, &dst, &fl);
- if (err)
- goto failure;
- if (final_p)
- ipv6_addr_copy(&fl.fl6_dst, final_p);
-
- if ((err = xfrm_lookup(&dst, &fl, sk, 0)) < 0)
- goto failure;
-
- if (saddr == NULL) {
- saddr = &fl.fl6_src;
- ipv6_addr_copy(&np->rcv_saddr, saddr);
- }
-
- /* set the source address */
- ipv6_addr_copy(&np->saddr, saddr);
- inet->rcv_saddr = LOOPBACK4_IPV6;
-
- ip6_dst_store(sk, dst, NULL);
+ return secure_dccpv6_sequence_number(ipv6_hdr(skb)->daddr.s6_addr32,
+ ipv6_hdr(skb)->saddr.s6_addr32,
+ dccp_hdr(skb)->dccph_dport,
+ dccp_hdr(skb)->dccph_sport );
- icsk->icsk_ext_hdr_len = 0;
- if (np->opt)
- icsk->icsk_ext_hdr_len = (np->opt->opt_flen +
- np->opt->opt_nflen);
-
- inet->dport = usin->sin6_port;
-
- dccp_set_state(sk, DCCP_REQUESTING);
- err = inet6_hash_connect(&dccp_death_row, sk);
- if (err)
- goto late_failure;
- /* FIXME */
-#if 0
- dp->dccps_gar = secure_dccp_v6_sequence_number(np->saddr.s6_addr32,
- np->daddr.s6_addr32,
- inet->sport,
- inet->dport);
-#endif
- err = dccp_connect(sk);
- if (err)
- goto late_failure;
-
- return 0;
-
-late_failure:
- dccp_set_state(sk, DCCP_CLOSED);
- __sk_dst_reset(sk);
-failure:
- inet->dport = 0;
- sk->sk_route_caps = 0;
- return err;
}
static void dccp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
- int type, int code, int offset, __u32 info)
+ u8 type, u8 code, int offset, __be32 info)
{
- struct ipv6hdr *hdr = (struct ipv6hdr *)skb->data;
+ const struct ipv6hdr *hdr = (const struct ipv6hdr *)skb->data;
const struct dccp_hdr *dh = (struct dccp_hdr *)(skb->data + offset);
+ struct dccp_sock *dp;
struct ipv6_pinfo *np;
struct sock *sk;
int err;
__u64 seq;
+ struct net *net = dev_net(skb->dev);
+
+ if (skb->len < offset + sizeof(*dh) ||
+ skb->len < offset + __dccp_basic_hdr_len(dh)) {
+ ICMP6_INC_STATS_BH(net, __in6_dev_get(skb->dev),
+ ICMP6_MIB_INERRORS);
+ return;
+ }
- sk = inet6_lookup(&dccp_hashinfo, &hdr->daddr, dh->dccph_dport,
- &hdr->saddr, dh->dccph_sport, skb->dev->ifindex);
+ sk = inet6_lookup(net, &dccp_hashinfo,
+ &hdr->daddr, dh->dccph_dport,
+ &hdr->saddr, dh->dccph_sport, inet6_iif(skb));
if (sk == NULL) {
- ICMP6_INC_STATS_BH(__in6_dev_get(skb->dev), ICMP6_MIB_INERRORS);
+ ICMP6_INC_STATS_BH(net, __in6_dev_get(skb->dev),
+ ICMP6_MIB_INERRORS);
return;
}
if (sk->sk_state == DCCP_TIME_WAIT) {
- inet_twsk_put((struct inet_timewait_sock *)sk);
+ inet_twsk_put(inet_twsk(sk));
return;
}
bh_lock_sock(sk);
if (sock_owned_by_user(sk))
- NET_INC_STATS_BH(LINUX_MIB_LOCKDROPPEDICMPS);
+ NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS);
if (sk->sk_state == DCCP_CLOSED)
goto out;
+ dp = dccp_sk(sk);
+ seq = dccp_hdr_seq(dh);
+ if ((1 << sk->sk_state) & ~(DCCPF_REQUESTING | DCCPF_LISTEN) &&
+ !between48(seq, dp->dccps_awl, dp->dccps_awh)) {
+ NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
+ goto out;
+ }
+
np = inet6_sk(sk);
+ if (type == NDISC_REDIRECT) {
+ struct dst_entry *dst = __sk_dst_check(sk, np->dst_cookie);
+
+ if (dst)
+ dst->ops->redirect(dst, sk, skb);
+ goto out;
+ }
+
if (type == ICMPV6_PKT_TOOBIG) {
struct dst_entry *dst = NULL;
+ if (!ip6_sk_accept_pmtu(sk))
+ goto out;
+
if (sock_owned_by_user(sk))
goto out;
if ((1 << sk->sk_state) & (DCCPF_LISTEN | DCCPF_CLOSED))
goto out;
- /* icmp should have updated the destination cache entry */
- dst = __sk_dst_check(sk, np->dst_cookie);
-
- if (dst == NULL) {
- struct inet_sock *inet = inet_sk(sk);
- struct flowi fl;
-
- /* BUGGG_FUTURE: Again, it is not clear how
- to handle rthdr case. Ignore this complexity
- for now.
- */
- memset(&fl, 0, sizeof(fl));
- fl.proto = IPPROTO_DCCP;
- ipv6_addr_copy(&fl.fl6_dst, &np->daddr);
- ipv6_addr_copy(&fl.fl6_src, &np->saddr);
- fl.oif = sk->sk_bound_dev_if;
- fl.fl_ip_dport = inet->dport;
- fl.fl_ip_sport = inet->sport;
-
- if ((err = ip6_dst_lookup(sk, &dst, &fl))) {
- sk->sk_err_soft = -err;
- goto out;
- }
-
- if ((err = xfrm_lookup(&dst, &fl, sk, 0)) < 0) {
- sk->sk_err_soft = -err;
- goto out;
- }
-
- } else
- dst_hold(dst);
+ dst = inet6_csk_update_pmtu(sk, ntohl(info));
+ if (!dst)
+ goto out;
- if (inet_csk(sk)->icsk_pmtu_cookie > dst_mtu(dst)) {
+ if (inet_csk(sk)->icsk_pmtu_cookie > dst_mtu(dst))
dccp_sync_mss(sk, dst_mtu(dst));
- } /* else let the usual retransmit timer handle it */
- dst_release(dst);
goto out;
}
icmpv6_err_convert(type, code, &err);
- seq = DCCP_SKB_CB(skb)->dccpd_seq;
/* Might be for an request_sock */
switch (sk->sk_state) {
struct request_sock *req, **prev;
@@ -355,16 +170,18 @@ static void dccp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
req = inet6_csk_search_req(sk, &prev, dh->dccph_dport,
&hdr->daddr, &hdr->saddr,
inet6_iif(skb));
- if (!req)
+ if (req == NULL)
goto out;
- /* ICMPs are not backlogged, hence we cannot get
- * an established socket here.
+ /*
+ * ICMPs are not backlogged, hence we cannot get an established
+ * socket here.
*/
- BUG_TRAP(req->sk == NULL);
+ WARN_ON(req->sk != NULL);
- if (seq != dccp_rsk(req)->dreq_iss) {
- NET_INC_STATS_BH(LINUX_MIB_OUTOFWINDOWICMPS);
+ if (!between48(seq, dccp_rsk(req)->dreq_iss,
+ dccp_rsk(req)->dreq_gss)) {
+ NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
goto out;
}
@@ -373,7 +190,7 @@ static void dccp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
case DCCP_REQUESTING:
case DCCP_RESPOND: /* Cannot happen.
- It can, it SYNs are crossed. --ANK */
+ It can, if SYNs are crossed. --ANK */
if (!sock_owned_by_user(sk)) {
DCCP_INC_STATS_BH(DCCP_MIB_ATTEMPTFAILS);
sk->sk_err = err;
@@ -382,7 +199,6 @@ static void dccp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
* (see connect in sock.c)
*/
sk->sk_error_report(sk);
-
dccp_done(sk);
} else
sk->sk_err_soft = err;
@@ -401,246 +217,119 @@ out:
}
-static int dccp_v6_send_response(struct sock *sk, struct request_sock *req,
- struct dst_entry *dst)
+static int dccp_v6_send_response(struct sock *sk, struct request_sock *req)
{
- struct inet6_request_sock *ireq6 = inet6_rsk(req);
+ struct inet_request_sock *ireq = inet_rsk(req);
struct ipv6_pinfo *np = inet6_sk(sk);
struct sk_buff *skb;
- struct ipv6_txoptions *opt = NULL;
- struct in6_addr *final_p = NULL, final;
- struct flowi fl;
+ struct in6_addr *final_p, final;
+ struct flowi6 fl6;
int err = -1;
+ struct dst_entry *dst;
- memset(&fl, 0, sizeof(fl));
- fl.proto = IPPROTO_DCCP;
- ipv6_addr_copy(&fl.fl6_dst, &ireq6->rmt_addr);
- ipv6_addr_copy(&fl.fl6_src, &ireq6->loc_addr);
- fl.fl6_flowlabel = 0;
- fl.oif = ireq6->iif;
- fl.fl_ip_dport = inet_rsk(req)->rmt_port;
- fl.fl_ip_sport = inet_sk(sk)->sport;
+ memset(&fl6, 0, sizeof(fl6));
+ fl6.flowi6_proto = IPPROTO_DCCP;
+ fl6.daddr = ireq->ir_v6_rmt_addr;
+ fl6.saddr = ireq->ir_v6_loc_addr;
+ fl6.flowlabel = 0;
+ fl6.flowi6_oif = ireq->ir_iif;
+ fl6.fl6_dport = ireq->ir_rmt_port;
+ fl6.fl6_sport = htons(ireq->ir_num);
+ security_req_classify_flow(req, flowi6_to_flowi(&fl6));
- if (dst == NULL) {
- opt = np->opt;
- if (opt == NULL &&
- np->rxopt.bits.osrcrt == 2 &&
- ireq6->pktopts) {
- struct sk_buff *pktopts = ireq6->pktopts;
- struct inet6_skb_parm *rxopt = IP6CB(pktopts);
- if (rxopt->srcrt)
- opt = ipv6_invert_rthdr(sk,
- (struct ipv6_rt_hdr *)(pktopts->nh.raw +
- rxopt->srcrt));
- }
- if (opt && opt->srcrt) {
- struct rt0_hdr *rt0 = (struct rt0_hdr *)opt->srcrt;
- ipv6_addr_copy(&final, &fl.fl6_dst);
- ipv6_addr_copy(&fl.fl6_dst, rt0->addr);
- final_p = &final;
- }
+ final_p = fl6_update_dst(&fl6, np->opt, &final);
- err = ip6_dst_lookup(sk, &dst, &fl);
- if (err)
- goto done;
- if (final_p)
- ipv6_addr_copy(&fl.fl6_dst, final_p);
- if ((err = xfrm_lookup(&dst, &fl, sk, 0)) < 0)
- goto done;
+ dst = ip6_dst_lookup_flow(sk, &fl6, final_p);
+ if (IS_ERR(dst)) {
+ err = PTR_ERR(dst);
+ dst = NULL;
+ goto done;
}
skb = dccp_make_response(sk, dst, req);
if (skb != NULL) {
struct dccp_hdr *dh = dccp_hdr(skb);
- dh->dccph_checksum = dccp_v6_check(dh, skb->len,
- &ireq6->loc_addr,
- &ireq6->rmt_addr,
- csum_partial((char *)dh,
- skb->len,
- skb->csum));
- ipv6_addr_copy(&fl.fl6_dst, &ireq6->rmt_addr);
- err = ip6_xmit(sk, skb, &fl, opt, 0);
- if (err == NET_XMIT_CN)
- err = 0;
+
+ dh->dccph_checksum = dccp_v6_csum_finish(skb,
+ &ireq->ir_v6_loc_addr,
+ &ireq->ir_v6_rmt_addr);
+ fl6.daddr = ireq->ir_v6_rmt_addr;
+ err = ip6_xmit(sk, skb, &fl6, np->opt, np->tclass);
+ err = net_xmit_eval(err);
}
done:
- if (opt && opt != np->opt)
- sock_kfree_s(sk, opt, opt->tot_len);
+ dst_release(dst);
return err;
}
static void dccp_v6_reqsk_destructor(struct request_sock *req)
{
- if (inet6_rsk(req)->pktopts != NULL)
- kfree_skb(inet6_rsk(req)->pktopts);
+ dccp_feat_list_purge(&dccp_rsk(req)->dreq_featneg);
+ kfree_skb(inet_rsk(req)->pktopts);
}
-static struct request_sock_ops dccp6_request_sock_ops = {
- .family = AF_INET6,
- .obj_size = sizeof(struct dccp6_request_sock),
- .rtx_syn_ack = dccp_v6_send_response,
- .send_ack = dccp_v6_reqsk_send_ack,
- .destructor = dccp_v6_reqsk_destructor,
- .send_reset = dccp_v6_ctl_send_reset,
-};
-
-static struct timewait_sock_ops dccp6_timewait_sock_ops = {
- .twsk_obj_size = sizeof(struct dccp6_timewait_sock),
-};
-
-static void dccp_v6_send_check(struct sock *sk, int len, struct sk_buff *skb)
+static void dccp_v6_ctl_send_reset(struct sock *sk, struct sk_buff *rxskb)
{
- struct ipv6_pinfo *np = inet6_sk(sk);
- struct dccp_hdr *dh = dccp_hdr(skb);
-
- dh->dccph_checksum = csum_ipv6_magic(&np->saddr, &np->daddr,
- len, IPPROTO_DCCP,
- csum_partial((char *)dh,
- dh->dccph_doff << 2,
- skb->csum));
-}
-
-static void dccp_v6_ctl_send_reset(struct sk_buff *rxskb)
-{
- struct dccp_hdr *rxdh = dccp_hdr(rxskb), *dh;
- const int dccp_hdr_reset_len = sizeof(struct dccp_hdr) +
- sizeof(struct dccp_hdr_ext) +
- sizeof(struct dccp_hdr_reset);
+ const struct ipv6hdr *rxip6h;
struct sk_buff *skb;
- struct flowi fl;
- u64 seqno;
+ struct flowi6 fl6;
+ struct net *net = dev_net(skb_dst(rxskb)->dev);
+ struct sock *ctl_sk = net->dccp.v6_ctl_sk;
+ struct dst_entry *dst;
- if (rxdh->dccph_type == DCCP_PKT_RESET)
+ if (dccp_hdr(rxskb)->dccph_type == DCCP_PKT_RESET)
return;
if (!ipv6_unicast_destination(rxskb))
- return;
+ return;
- /*
- * We need to grab some memory, and put together an RST,
- * and then put it into the queue to be sent.
- */
+ skb = dccp_ctl_make_reset(ctl_sk, rxskb);
+ if (skb == NULL)
+ return;
- skb = alloc_skb(MAX_HEADER + sizeof(struct ipv6hdr) +
- dccp_hdr_reset_len, GFP_ATOMIC);
- if (skb == NULL)
- return;
+ rxip6h = ipv6_hdr(rxskb);
+ dccp_hdr(skb)->dccph_checksum = dccp_v6_csum_finish(skb, &rxip6h->saddr,
+ &rxip6h->daddr);
- skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr) +
- dccp_hdr_reset_len);
+ memset(&fl6, 0, sizeof(fl6));
+ fl6.daddr = rxip6h->saddr;
+ fl6.saddr = rxip6h->daddr;
- skb->h.raw = skb_push(skb, dccp_hdr_reset_len);
- dh = dccp_hdr(skb);
- memset(dh, 0, dccp_hdr_reset_len);
-
- /* Swap the send and the receive. */
- dh->dccph_type = DCCP_PKT_RESET;
- dh->dccph_sport = rxdh->dccph_dport;
- dh->dccph_dport = rxdh->dccph_sport;
- dh->dccph_doff = dccp_hdr_reset_len / 4;
- dh->dccph_x = 1;
- dccp_hdr_reset(skb)->dccph_reset_code =
- DCCP_SKB_CB(rxskb)->dccpd_reset_code;
-
- /* See "8.3.1. Abnormal Termination" in draft-ietf-dccp-spec-11 */
- seqno = 0;
- if (DCCP_SKB_CB(rxskb)->dccpd_ack_seq != DCCP_PKT_WITHOUT_ACK_SEQ)
- dccp_set_seqno(&seqno, DCCP_SKB_CB(rxskb)->dccpd_ack_seq + 1);
-
- dccp_hdr_set_seq(dh, seqno);
- dccp_hdr_set_ack(dccp_hdr_ack_bits(skb),
- DCCP_SKB_CB(rxskb)->dccpd_seq);
-
- memset(&fl, 0, sizeof(fl));
- ipv6_addr_copy(&fl.fl6_dst, &rxskb->nh.ipv6h->saddr);
- ipv6_addr_copy(&fl.fl6_src, &rxskb->nh.ipv6h->daddr);
- dh->dccph_checksum = csum_ipv6_magic(&fl.fl6_src, &fl.fl6_dst,
- sizeof(*dh), IPPROTO_DCCP,
- skb->csum);
- fl.proto = IPPROTO_DCCP;
- fl.oif = inet6_iif(rxskb);
- fl.fl_ip_dport = dh->dccph_dport;
- fl.fl_ip_sport = dh->dccph_sport;
+ fl6.flowi6_proto = IPPROTO_DCCP;
+ fl6.flowi6_oif = inet6_iif(rxskb);
+ fl6.fl6_dport = dccp_hdr(skb)->dccph_dport;
+ fl6.fl6_sport = dccp_hdr(skb)->dccph_sport;
+ security_skb_classify_flow(rxskb, flowi6_to_flowi(&fl6));
/* sk = NULL, but it is safe for now. RST socket required. */
- if (!ip6_dst_lookup(NULL, &skb->dst, &fl)) {
- if (xfrm_lookup(&skb->dst, &fl, NULL, 0) >= 0) {
- ip6_xmit(NULL, skb, &fl, NULL, 0);
- DCCP_INC_STATS_BH(DCCP_MIB_OUTSEGS);
- DCCP_INC_STATS_BH(DCCP_MIB_OUTRSTS);
- return;
- }
- }
-
- kfree_skb(skb);
-}
-
-static void dccp_v6_ctl_send_ack(struct sk_buff *rxskb)
-{
- struct flowi fl;
- struct dccp_hdr *rxdh = dccp_hdr(rxskb), *dh;
- const int dccp_hdr_ack_len = sizeof(struct dccp_hdr) +
- sizeof(struct dccp_hdr_ext) +
- sizeof(struct dccp_hdr_ack_bits);
- struct sk_buff *skb;
-
- skb = alloc_skb(MAX_HEADER + sizeof(struct ipv6hdr) +
- dccp_hdr_ack_len, GFP_ATOMIC);
- if (skb == NULL)
+ dst = ip6_dst_lookup_flow(ctl_sk, &fl6, NULL);
+ if (!IS_ERR(dst)) {
+ skb_dst_set(skb, dst);
+ ip6_xmit(ctl_sk, skb, &fl6, NULL, 0);
+ DCCP_INC_STATS_BH(DCCP_MIB_OUTSEGS);
+ DCCP_INC_STATS_BH(DCCP_MIB_OUTRSTS);
return;
-
- skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr) +
- dccp_hdr_ack_len);
-
- skb->h.raw = skb_push(skb, dccp_hdr_ack_len);
- dh = dccp_hdr(skb);
- memset(dh, 0, dccp_hdr_ack_len);
-
- /* Build DCCP header and checksum it. */
- dh->dccph_type = DCCP_PKT_ACK;
- dh->dccph_sport = rxdh->dccph_dport;
- dh->dccph_dport = rxdh->dccph_sport;
- dh->dccph_doff = dccp_hdr_ack_len / 4;
- dh->dccph_x = 1;
-
- dccp_hdr_set_seq(dh, DCCP_SKB_CB(rxskb)->dccpd_ack_seq);
- dccp_hdr_set_ack(dccp_hdr_ack_bits(skb),
- DCCP_SKB_CB(rxskb)->dccpd_seq);
-
- memset(&fl, 0, sizeof(fl));
- ipv6_addr_copy(&fl.fl6_dst, &rxskb->nh.ipv6h->saddr);
- ipv6_addr_copy(&fl.fl6_src, &rxskb->nh.ipv6h->daddr);
-
- /* FIXME: calculate checksum, IPv4 also should... */
-
- fl.proto = IPPROTO_DCCP;
- fl.oif = inet6_iif(rxskb);
- fl.fl_ip_dport = dh->dccph_dport;
- fl.fl_ip_sport = dh->dccph_sport;
-
- if (!ip6_dst_lookup(NULL, &skb->dst, &fl)) {
- if (xfrm_lookup(&skb->dst, &fl, NULL, 0) >= 0) {
- ip6_xmit(NULL, skb, &fl, NULL, 0);
- DCCP_INC_STATS_BH(DCCP_MIB_OUTSEGS);
- return;
- }
}
kfree_skb(skb);
}
-static void dccp_v6_reqsk_send_ack(struct sk_buff *skb,
- struct request_sock *req)
-{
- dccp_v6_ctl_send_ack(skb);
-}
+static struct request_sock_ops dccp6_request_sock_ops = {
+ .family = AF_INET6,
+ .obj_size = sizeof(struct dccp6_request_sock),
+ .rtx_syn_ack = dccp_v6_send_response,
+ .send_ack = dccp_reqsk_send_ack,
+ .destructor = dccp_v6_reqsk_destructor,
+ .send_reset = dccp_v6_ctl_send_reset,
+ .syn_ack_timeout = dccp_syn_ack_timeout,
+};
static struct sock *dccp_v6_hnd_req(struct sock *sk,struct sk_buff *skb)
{
const struct dccp_hdr *dh = dccp_hdr(skb);
- const struct ipv6hdr *iph = skb->nh.ipv6h;
+ const struct ipv6hdr *iph = ipv6_hdr(skb);
struct sock *nsk;
struct request_sock **prev;
/* Find possible connection requests. */
@@ -652,17 +341,16 @@ static struct sock *dccp_v6_hnd_req(struct sock *sk,struct sk_buff *skb)
if (req != NULL)
return dccp_check_req(sk, skb, req, prev);
- nsk = __inet6_lookup_established(&dccp_hashinfo,
+ nsk = __inet6_lookup_established(sock_net(sk), &dccp_hashinfo,
&iph->saddr, dh->dccph_sport,
&iph->daddr, ntohs(dh->dccph_dport),
inet6_iif(skb));
-
if (nsk != NULL) {
if (nsk->sk_state != DCCP_TIME_WAIT) {
bh_lock_sock(nsk);
return nsk;
}
- inet_twsk_put((struct inet_timewait_sock *)nsk);
+ inet_twsk_put(inet_twsk(nsk));
return NULL;
}
@@ -671,78 +359,78 @@ static struct sock *dccp_v6_hnd_req(struct sock *sk,struct sk_buff *skb)
static int dccp_v6_conn_request(struct sock *sk, struct sk_buff *skb)
{
- struct inet_request_sock *ireq;
- struct dccp_sock dp;
struct request_sock *req;
struct dccp_request_sock *dreq;
- struct inet6_request_sock *ireq6;
+ struct inet_request_sock *ireq;
struct ipv6_pinfo *np = inet6_sk(sk);
- const __u32 service = dccp_hdr_request(skb)->dccph_req_service;
+ const __be32 service = dccp_hdr_request(skb)->dccph_req_service;
struct dccp_skb_cb *dcb = DCCP_SKB_CB(skb);
- __u8 reset_code = DCCP_RESET_CODE_TOO_BUSY;
if (skb->protocol == htons(ETH_P_IP))
return dccp_v4_conn_request(sk, skb);
if (!ipv6_unicast_destination(skb))
- goto drop;
+ return 0; /* discard, don't send a reset here */
if (dccp_bad_service_code(sk, service)) {
- reset_code = DCCP_RESET_CODE_BAD_SERVICE_CODE;
+ dcb->dccpd_reset_code = DCCP_RESET_CODE_BAD_SERVICE_CODE;
goto drop;
- }
+ }
/*
- * There are no SYN attacks on IPv6, yet...
+ * There are no SYN attacks on IPv6, yet...
*/
+ dcb->dccpd_reset_code = DCCP_RESET_CODE_TOO_BUSY;
if (inet_csk_reqsk_queue_is_full(sk))
- goto drop;
+ goto drop;
if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1)
goto drop;
- req = inet6_reqsk_alloc(sk->sk_prot->rsk_prot);
+ req = inet6_reqsk_alloc(&dccp6_request_sock_ops);
if (req == NULL)
goto drop;
- /* FIXME: process options */
+ if (dccp_reqsk_init(req, dccp_sk(sk), skb))
+ goto drop_and_free;
- dccp_openreq_init(req, &dp, skb);
+ dreq = dccp_rsk(req);
+ if (dccp_parse_options(sk, dreq, skb))
+ goto drop_and_free;
+
+ if (security_inet_conn_request(sk, skb, req))
+ goto drop_and_free;
- ireq6 = inet6_rsk(req);
ireq = inet_rsk(req);
- ipv6_addr_copy(&ireq6->rmt_addr, &skb->nh.ipv6h->saddr);
- ipv6_addr_copy(&ireq6->loc_addr, &skb->nh.ipv6h->daddr);
- req->rcv_wnd = 100; /* Fake, option parsing will get the
- right value */
- ireq6->pktopts = NULL;
+ ireq->ir_v6_rmt_addr = ipv6_hdr(skb)->saddr;
+ ireq->ir_v6_loc_addr = ipv6_hdr(skb)->daddr;
if (ipv6_opt_accepted(sk, skb) ||
np->rxopt.bits.rxinfo || np->rxopt.bits.rxoinfo ||
np->rxopt.bits.rxhlim || np->rxopt.bits.rxohlim) {
atomic_inc(&skb->users);
- ireq6->pktopts = skb;
+ ireq->pktopts = skb;
}
- ireq6->iif = sk->sk_bound_dev_if;
+ ireq->ir_iif = sk->sk_bound_dev_if;
/* So that link locals have meaning */
if (!sk->sk_bound_dev_if &&
- ipv6_addr_type(&ireq6->rmt_addr) & IPV6_ADDR_LINKLOCAL)
- ireq6->iif = inet6_iif(skb);
+ ipv6_addr_type(&ireq->ir_v6_rmt_addr) & IPV6_ADDR_LINKLOCAL)
+ ireq->ir_iif = inet6_iif(skb);
- /*
+ /*
* Step 3: Process LISTEN state
*
- * Set S.ISR, S.GSR, S.SWL, S.SWH from packet or Init Cookie
+ * Set S.ISR, S.GSR, S.SWL, S.SWH from packet or Init Cookie
*
- * In fact we defer setting S.GSR, S.SWL, S.SWH to
- * dccp_create_openreq_child.
+ * Setting S.SWL/S.SWH is deferred to dccp_create_openreq_child().
*/
- dreq = dccp_rsk(req);
dreq->dreq_isr = dcb->dccpd_seq;
- dreq->dreq_iss = dccp_v6_init_sequence(sk, skb);
+ dreq->dreq_gsr = dreq->dreq_isr;
+ dreq->dreq_iss = dccp_v6_init_sequence(skb);
+ dreq->dreq_gss = dreq->dreq_iss;
dreq->dreq_service = service;
- if (dccp_v6_send_response(sk, req, NULL))
+ if (dccp_v6_send_response(sk, req))
goto drop_and_free;
inet6_csk_reqsk_queue_hash_add(sk, req, DCCP_TIMEOUT_INIT);
@@ -752,7 +440,6 @@ drop_and_free:
reqsk_free(req);
drop:
DCCP_INC_STATS_BH(DCCP_MIB_ATTEMPTFAILS);
- dcb->dccpd_reset_code = reset_code;
return -1;
}
@@ -761,45 +448,39 @@ static struct sock *dccp_v6_request_recv_sock(struct sock *sk,
struct request_sock *req,
struct dst_entry *dst)
{
- struct inet6_request_sock *ireq6 = inet6_rsk(req);
+ struct inet_request_sock *ireq = inet_rsk(req);
struct ipv6_pinfo *newnp, *np = inet6_sk(sk);
struct inet_sock *newinet;
- struct dccp_sock *newdp;
struct dccp6_sock *newdp6;
struct sock *newsk;
- struct ipv6_txoptions *opt;
if (skb->protocol == htons(ETH_P_IP)) {
/*
* v6 mapped
*/
-
newsk = dccp_v4_request_recv_sock(sk, skb, req, dst);
- if (newsk == NULL)
+ if (newsk == NULL)
return NULL;
newdp6 = (struct dccp6_sock *)newsk;
- newdp = dccp_sk(newsk);
newinet = inet_sk(newsk);
newinet->pinet6 = &newdp6->inet6;
newnp = inet6_sk(newsk);
memcpy(newnp, np, sizeof(struct ipv6_pinfo));
- ipv6_addr_set(&newnp->daddr, 0, 0, htonl(0x0000FFFF),
- newinet->daddr);
+ ipv6_addr_set_v4mapped(newinet->inet_daddr, &newsk->sk_v6_daddr);
- ipv6_addr_set(&newnp->saddr, 0, 0, htonl(0x0000FFFF),
- newinet->saddr);
+ ipv6_addr_set_v4mapped(newinet->inet_saddr, &newnp->saddr);
- ipv6_addr_copy(&newnp->rcv_saddr, &newnp->saddr);
+ newsk->sk_v6_rcv_saddr = newnp->saddr;
inet_csk(newsk)->icsk_af_ops = &dccp_ipv6_mapped;
newsk->sk_backlog_rcv = dccp_v4_do_rcv;
newnp->pktoptions = NULL;
newnp->opt = NULL;
newnp->mcast_oif = inet6_iif(skb);
- newnp->mcast_hops = skb->nh.ipv6h->hop_limit;
+ newnp->mcast_hops = ipv6_hdr(skb)->hop_limit;
/*
* No need to charge this sock to the relevant IPv6 refcnt debug socks count
@@ -816,51 +497,32 @@ static struct sock *dccp_v6_request_recv_sock(struct sock *sk,
return newsk;
}
- opt = np->opt;
if (sk_acceptq_is_full(sk))
goto out_overflow;
- if (np->rxopt.bits.osrcrt == 2 &&
- opt == NULL && ireq6->pktopts) {
- struct inet6_skb_parm *rxopt = IP6CB(ireq6->pktopts);
- if (rxopt->srcrt)
- opt = ipv6_invert_rthdr(sk,
- (struct ipv6_rt_hdr *)(ireq6->pktopts->nh.raw +
- rxopt->srcrt));
- }
-
if (dst == NULL) {
- struct in6_addr *final_p = NULL, final;
- struct flowi fl;
-
- memset(&fl, 0, sizeof(fl));
- fl.proto = IPPROTO_DCCP;
- ipv6_addr_copy(&fl.fl6_dst, &ireq6->rmt_addr);
- if (opt && opt->srcrt) {
- struct rt0_hdr *rt0 = (struct rt0_hdr *) opt->srcrt;
- ipv6_addr_copy(&final, &fl.fl6_dst);
- ipv6_addr_copy(&fl.fl6_dst, rt0->addr);
- final_p = &final;
- }
- ipv6_addr_copy(&fl.fl6_src, &ireq6->loc_addr);
- fl.oif = sk->sk_bound_dev_if;
- fl.fl_ip_dport = inet_rsk(req)->rmt_port;
- fl.fl_ip_sport = inet_sk(sk)->sport;
-
- if (ip6_dst_lookup(sk, &dst, &fl))
- goto out;
-
- if (final_p)
- ipv6_addr_copy(&fl.fl6_dst, final_p);
-
- if ((xfrm_lookup(&dst, &fl, sk, 0)) < 0)
+ struct in6_addr *final_p, final;
+ struct flowi6 fl6;
+
+ memset(&fl6, 0, sizeof(fl6));
+ fl6.flowi6_proto = IPPROTO_DCCP;
+ fl6.daddr = ireq->ir_v6_rmt_addr;
+ final_p = fl6_update_dst(&fl6, np->opt, &final);
+ fl6.saddr = ireq->ir_v6_loc_addr;
+ fl6.flowi6_oif = sk->sk_bound_dev_if;
+ fl6.fl6_dport = ireq->ir_rmt_port;
+ fl6.fl6_sport = htons(ireq->ir_num);
+ security_sk_classify_flow(sk, flowi6_to_flowi(&fl6));
+
+ dst = ip6_dst_lookup_flow(sk, &fl6, final_p);
+ if (IS_ERR(dst))
goto out;
- }
+ }
newsk = dccp_create_openreq_child(sk, req, skb);
if (newsk == NULL)
- goto out;
+ goto out_nonewsk;
/*
* No need to charge this sock to the relevant IPv6 refcnt debug socks
@@ -868,78 +530,77 @@ static struct sock *dccp_v6_request_recv_sock(struct sock *sk,
* comment in that function for the gory details. -acme
*/
- ip6_dst_store(newsk, dst, NULL);
- newsk->sk_route_caps = dst->dev->features &
- ~(NETIF_F_IP_CSUM | NETIF_F_TSO);
-
+ __ip6_dst_store(newsk, dst, NULL, NULL);
+ newsk->sk_route_caps = dst->dev->features & ~(NETIF_F_IP_CSUM |
+ NETIF_F_TSO);
newdp6 = (struct dccp6_sock *)newsk;
newinet = inet_sk(newsk);
newinet->pinet6 = &newdp6->inet6;
- newdp = dccp_sk(newsk);
newnp = inet6_sk(newsk);
memcpy(newnp, np, sizeof(struct ipv6_pinfo));
- ipv6_addr_copy(&newnp->daddr, &ireq6->rmt_addr);
- ipv6_addr_copy(&newnp->saddr, &ireq6->loc_addr);
- ipv6_addr_copy(&newnp->rcv_saddr, &ireq6->loc_addr);
- newsk->sk_bound_dev_if = ireq6->iif;
+ newsk->sk_v6_daddr = ireq->ir_v6_rmt_addr;
+ newnp->saddr = ireq->ir_v6_loc_addr;
+ newsk->sk_v6_rcv_saddr = ireq->ir_v6_loc_addr;
+ newsk->sk_bound_dev_if = ireq->ir_iif;
- /* Now IPv6 options...
+ /* Now IPv6 options...
First: no IPv4 options.
*/
- newinet->opt = NULL;
+ newinet->inet_opt = NULL;
/* Clone RX bits */
newnp->rxopt.all = np->rxopt.all;
/* Clone pktoptions received with SYN */
newnp->pktoptions = NULL;
- if (ireq6->pktopts != NULL) {
- newnp->pktoptions = skb_clone(ireq6->pktopts, GFP_ATOMIC);
- kfree_skb(ireq6->pktopts);
- ireq6->pktopts = NULL;
+ if (ireq->pktopts != NULL) {
+ newnp->pktoptions = skb_clone(ireq->pktopts, GFP_ATOMIC);
+ consume_skb(ireq->pktopts);
+ ireq->pktopts = NULL;
if (newnp->pktoptions)
skb_set_owner_r(newnp->pktoptions, newsk);
}
newnp->opt = NULL;
newnp->mcast_oif = inet6_iif(skb);
- newnp->mcast_hops = skb->nh.ipv6h->hop_limit;
+ newnp->mcast_hops = ipv6_hdr(skb)->hop_limit;
- /* Clone native IPv6 options from listening socket (if any)
-
- Yes, keeping reference count would be much more clever,
- but we make one more one thing there: reattach optmem
- to newsk.
+ /*
+ * Clone native IPv6 options from listening socket (if any)
+ *
+ * Yes, keeping reference count would be much more clever, but we do
+ * one more thing there: reattach optmem to newsk.
*/
- if (opt) {
- newnp->opt = ipv6_dup_options(newsk, opt);
- if (opt != np->opt)
- sock_kfree_s(sk, opt, opt->tot_len);
- }
+ if (np->opt != NULL)
+ newnp->opt = ipv6_dup_options(newsk, np->opt);
inet_csk(newsk)->icsk_ext_hdr_len = 0;
- if (newnp->opt)
+ if (newnp->opt != NULL)
inet_csk(newsk)->icsk_ext_hdr_len = (newnp->opt->opt_nflen +
newnp->opt->opt_flen);
dccp_sync_mss(newsk, dst_mtu(dst));
- newinet->daddr = newinet->saddr = newinet->rcv_saddr = LOOPBACK4_IPV6;
+ newinet->inet_daddr = newinet->inet_saddr = LOOPBACK4_IPV6;
+ newinet->inet_rcv_saddr = LOOPBACK4_IPV6;
- __inet6_hash(&dccp_hashinfo, newsk);
- inet_inherit_port(&dccp_hashinfo, sk, newsk);
+ if (__inet_inherit_port(sk, newsk) < 0) {
+ inet_csk_prepare_forced_close(newsk);
+ dccp_done(newsk);
+ goto out;
+ }
+ __inet6_hash(newsk, NULL);
return newsk;
out_overflow:
- NET_INC_STATS_BH(LINUX_MIB_LISTENOVERFLOWS);
-out:
- NET_INC_STATS_BH(LINUX_MIB_LISTENDROPS);
- if (opt && opt != np->opt)
- sock_kfree_s(sk, opt, opt->tot_len);
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
+out_nonewsk:
dst_release(dst);
+out:
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
return NULL;
}
@@ -967,12 +628,12 @@ static int dccp_v6_do_rcv(struct sock *sk, struct sk_buff *skb)
if (skb->protocol == htons(ETH_P_IP))
return dccp_v4_do_rcv(sk, skb);
- if (sk_filter(sk, skb, 0))
+ if (sk_filter(sk, skb))
goto discard;
/*
- * socket locking is here for SMP purposes as backlog rcv
- * is currently called with bh processing disabled.
+ * socket locking is here for SMP purposes as backlog rcv is currently
+ * called with bh processing disabled.
*/
/* Do Stevens' IPV6_PKTOPTIONS.
@@ -986,31 +647,63 @@ static int dccp_v6_do_rcv(struct sock *sk, struct sk_buff *skb)
looks not very well thought. For now we latch
options, received in the last packet, enqueued
by tcp. Feel free to propose better solution.
- --ANK (980728)
+ --ANK (980728)
*/
if (np->rxopt.all)
+ /*
+ * FIXME: Add handling of IPV6_PKTOPTIONS skb. See the comments below
+ * (wrt ipv6_pktoptions) and net/ipv6/tcp_ipv6.c for an example.
+ */
opt_skb = skb_clone(skb, GFP_ATOMIC);
if (sk->sk_state == DCCP_OPEN) { /* Fast path */
if (dccp_rcv_established(sk, skb, dccp_hdr(skb), skb->len))
goto reset;
+ if (opt_skb) {
+ /* XXX This is where we would goto ipv6_pktoptions. */
+ __kfree_skb(opt_skb);
+ }
return 0;
}
- if (sk->sk_state == DCCP_LISTEN) {
+ /*
+ * Step 3: Process LISTEN state
+ * If S.state == LISTEN,
+ * If P.type == Request or P contains a valid Init Cookie option,
+ * (* Must scan the packet's options to check for Init
+ * Cookies. Only Init Cookies are processed here,
+ * however; other options are processed in Step 8. This
+ * scan need only be performed if the endpoint uses Init
+ * Cookies *)
+ * (* Generate a new socket and switch to that socket *)
+ * Set S := new socket for this port pair
+ * S.state = RESPOND
+ * Choose S.ISS (initial seqno) or set from Init Cookies
+ * Initialize S.GAR := S.ISS
+ * Set S.ISR, S.GSR, S.SWL, S.SWH from packet or Init Cookies
+ * Continue with S.state == RESPOND
+ * (* A Response packet will be generated in Step 11 *)
+ * Otherwise,
+ * Generate Reset(No Connection) unless P.type == Reset
+ * Drop packet and return
+ *
+ * NOTE: the check for the packet types is done in
+ * dccp_rcv_state_process
+ */
+ if (sk->sk_state == DCCP_LISTEN) {
struct sock *nsk = dccp_v6_hnd_req(sk, skb);
- if (!nsk)
- goto discard;
+ if (nsk == NULL)
+ goto discard;
/*
* Queue it on the new socket if the new socket is active,
* otherwise we just shortcircuit this and continue with
* the new socket..
*/
- if(nsk != sk) {
+ if (nsk != sk) {
if (dccp_child_process(sk, nsk, skb))
goto reset;
- if (opt_skb)
+ if (opt_skb != NULL)
__kfree_skb(opt_skb);
return 0;
}
@@ -1018,31 +711,42 @@ static int dccp_v6_do_rcv(struct sock *sk, struct sk_buff *skb)
if (dccp_rcv_state_process(sk, skb, dccp_hdr(skb), skb->len))
goto reset;
+ if (opt_skb) {
+ /* XXX This is where we would goto ipv6_pktoptions. */
+ __kfree_skb(opt_skb);
+ }
return 0;
reset:
- dccp_v6_ctl_send_reset(skb);
+ dccp_v6_ctl_send_reset(sk, skb);
discard:
- if (opt_skb)
+ if (opt_skb != NULL)
__kfree_skb(opt_skb);
kfree_skb(skb);
return 0;
}
-static int dccp_v6_rcv(struct sk_buff **pskb)
+static int dccp_v6_rcv(struct sk_buff *skb)
{
const struct dccp_hdr *dh;
- struct sk_buff *skb = *pskb;
struct sock *sk;
+ int min_cov;
- /* Step 1: Check header basics: */
+ /* Step 1: Check header basics */
if (dccp_invalid_packet(skb))
goto discard_it;
+ /* Step 1: If header checksum is incorrect, drop packet and return. */
+ if (dccp_v6_csum_finish(skb, &ipv6_hdr(skb)->saddr,
+ &ipv6_hdr(skb)->daddr)) {
+ DCCP_WARN("dropped packet with invalid checksum\n");
+ goto discard_it;
+ }
+
dh = dccp_hdr(skb);
- DCCP_SKB_CB(skb)->dccpd_seq = dccp_hdr_seq(skb);
+ DCCP_SKB_CB(skb)->dccpd_seq = dccp_hdr_seq(dh);
DCCP_SKB_CB(skb)->dccpd_type = dh->dccph_type;
if (dccp_packet_without_ack(skb))
@@ -1051,93 +755,269 @@ static int dccp_v6_rcv(struct sk_buff **pskb)
DCCP_SKB_CB(skb)->dccpd_ack_seq = dccp_hdr_ack_seq(skb);
/* Step 2:
- * Look up flow ID in table and get corresponding socket */
- sk = __inet6_lookup(&dccp_hashinfo, &skb->nh.ipv6h->saddr,
- dh->dccph_sport,
- &skb->nh.ipv6h->daddr, ntohs(dh->dccph_dport),
- inet6_iif(skb));
- /*
+ * Look up flow ID in table and get corresponding socket */
+ sk = __inet6_lookup_skb(&dccp_hashinfo, skb,
+ dh->dccph_sport, dh->dccph_dport);
+ /*
* Step 2:
- * If no socket ...
- * Generate Reset(No Connection) unless P.type == Reset
- * Drop packet and return
+ * If no socket ...
*/
- if (sk == NULL)
+ if (sk == NULL) {
+ dccp_pr_debug("failed to look up flow ID in table and "
+ "get corresponding socket\n");
goto no_dccp_socket;
+ }
- /*
+ /*
* Step 2:
- * ... or S.state == TIMEWAIT,
+ * ... or S.state == TIMEWAIT,
* Generate Reset(No Connection) unless P.type == Reset
* Drop packet and return
*/
-
- if (sk->sk_state == DCCP_TIME_WAIT)
- goto do_time_wait;
+ if (sk->sk_state == DCCP_TIME_WAIT) {
+ dccp_pr_debug("sk->sk_state == DCCP_TIME_WAIT: do_time_wait\n");
+ inet_twsk_put(inet_twsk(sk));
+ goto no_dccp_socket;
+ }
+
+ /*
+ * RFC 4340, sec. 9.2.1: Minimum Checksum Coverage
+ * o if MinCsCov = 0, only packets with CsCov = 0 are accepted
+ * o if MinCsCov > 0, also accept packets with CsCov >= MinCsCov
+ */
+ min_cov = dccp_sk(sk)->dccps_pcrlen;
+ if (dh->dccph_cscov && (min_cov == 0 || dh->dccph_cscov < min_cov)) {
+ dccp_pr_debug("Packet CsCov %d does not satisfy MinCsCov %d\n",
+ dh->dccph_cscov, min_cov);
+ /* FIXME: send Data Dropped option (see also dccp_v4_rcv) */
+ goto discard_and_relse;
+ }
if (!xfrm6_policy_check(sk, XFRM_POLICY_IN, skb))
goto discard_and_relse;
- return sk_receive_skb(sk, skb) ? -1 : 0;
+ return sk_receive_skb(sk, skb, 1) ? -1 : 0;
no_dccp_socket:
if (!xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb))
goto discard_it;
/*
* Step 2:
+ * If no socket ...
* Generate Reset(No Connection) unless P.type == Reset
* Drop packet and return
*/
if (dh->dccph_type != DCCP_PKT_RESET) {
DCCP_SKB_CB(skb)->dccpd_reset_code =
DCCP_RESET_CODE_NO_CONNECTION;
- dccp_v6_ctl_send_reset(skb);
+ dccp_v6_ctl_send_reset(sk, skb);
}
-discard_it:
-
- /*
- * Discard frame
- */
+discard_it:
kfree_skb(skb);
return 0;
discard_and_relse:
sock_put(sk);
goto discard_it;
+}
+
+static int dccp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
+ int addr_len)
+{
+ struct sockaddr_in6 *usin = (struct sockaddr_in6 *)uaddr;
+ struct inet_connection_sock *icsk = inet_csk(sk);
+ struct inet_sock *inet = inet_sk(sk);
+ struct ipv6_pinfo *np = inet6_sk(sk);
+ struct dccp_sock *dp = dccp_sk(sk);
+ struct in6_addr *saddr = NULL, *final_p, final;
+ struct flowi6 fl6;
+ struct dst_entry *dst;
+ int addr_type;
+ int err;
+
+ dp->dccps_role = DCCP_ROLE_CLIENT;
-do_time_wait:
- inet_twsk_put((struct inet_timewait_sock *)sk);
- goto no_dccp_socket;
+ if (addr_len < SIN6_LEN_RFC2133)
+ return -EINVAL;
+
+ if (usin->sin6_family != AF_INET6)
+ return -EAFNOSUPPORT;
+
+ memset(&fl6, 0, sizeof(fl6));
+
+ if (np->sndflow) {
+ fl6.flowlabel = usin->sin6_flowinfo & IPV6_FLOWINFO_MASK;
+ IP6_ECN_flow_init(fl6.flowlabel);
+ if (fl6.flowlabel & IPV6_FLOWLABEL_MASK) {
+ struct ip6_flowlabel *flowlabel;
+ flowlabel = fl6_sock_lookup(sk, fl6.flowlabel);
+ if (flowlabel == NULL)
+ return -EINVAL;
+ fl6_sock_release(flowlabel);
+ }
+ }
+ /*
+ * connect() to INADDR_ANY means loopback (BSD'ism).
+ */
+ if (ipv6_addr_any(&usin->sin6_addr))
+ usin->sin6_addr.s6_addr[15] = 1;
+
+ addr_type = ipv6_addr_type(&usin->sin6_addr);
+
+ if (addr_type & IPV6_ADDR_MULTICAST)
+ return -ENETUNREACH;
+
+ if (addr_type & IPV6_ADDR_LINKLOCAL) {
+ if (addr_len >= sizeof(struct sockaddr_in6) &&
+ usin->sin6_scope_id) {
+ /* If interface is set while binding, indices
+ * must coincide.
+ */
+ if (sk->sk_bound_dev_if &&
+ sk->sk_bound_dev_if != usin->sin6_scope_id)
+ return -EINVAL;
+
+ sk->sk_bound_dev_if = usin->sin6_scope_id;
+ }
+
+ /* Connect to link-local address requires an interface */
+ if (!sk->sk_bound_dev_if)
+ return -EINVAL;
+ }
+
+ sk->sk_v6_daddr = usin->sin6_addr;
+ np->flow_label = fl6.flowlabel;
+
+ /*
+ * DCCP over IPv4
+ */
+ if (addr_type == IPV6_ADDR_MAPPED) {
+ u32 exthdrlen = icsk->icsk_ext_hdr_len;
+ struct sockaddr_in sin;
+
+ SOCK_DEBUG(sk, "connect: ipv4 mapped\n");
+
+ if (__ipv6_only_sock(sk))
+ return -ENETUNREACH;
+
+ sin.sin_family = AF_INET;
+ sin.sin_port = usin->sin6_port;
+ sin.sin_addr.s_addr = usin->sin6_addr.s6_addr32[3];
+
+ icsk->icsk_af_ops = &dccp_ipv6_mapped;
+ sk->sk_backlog_rcv = dccp_v4_do_rcv;
+
+ err = dccp_v4_connect(sk, (struct sockaddr *)&sin, sizeof(sin));
+ if (err) {
+ icsk->icsk_ext_hdr_len = exthdrlen;
+ icsk->icsk_af_ops = &dccp_ipv6_af_ops;
+ sk->sk_backlog_rcv = dccp_v6_do_rcv;
+ goto failure;
+ }
+ ipv6_addr_set_v4mapped(inet->inet_saddr, &np->saddr);
+ ipv6_addr_set_v4mapped(inet->inet_rcv_saddr, &sk->sk_v6_rcv_saddr);
+
+ return err;
+ }
+
+ if (!ipv6_addr_any(&sk->sk_v6_rcv_saddr))
+ saddr = &sk->sk_v6_rcv_saddr;
+
+ fl6.flowi6_proto = IPPROTO_DCCP;
+ fl6.daddr = sk->sk_v6_daddr;
+ fl6.saddr = saddr ? *saddr : np->saddr;
+ fl6.flowi6_oif = sk->sk_bound_dev_if;
+ fl6.fl6_dport = usin->sin6_port;
+ fl6.fl6_sport = inet->inet_sport;
+ security_sk_classify_flow(sk, flowi6_to_flowi(&fl6));
+
+ final_p = fl6_update_dst(&fl6, np->opt, &final);
+
+ dst = ip6_dst_lookup_flow(sk, &fl6, final_p);
+ if (IS_ERR(dst)) {
+ err = PTR_ERR(dst);
+ goto failure;
+ }
+
+ if (saddr == NULL) {
+ saddr = &fl6.saddr;
+ sk->sk_v6_rcv_saddr = *saddr;
+ }
+
+ /* set the source address */
+ np->saddr = *saddr;
+ inet->inet_rcv_saddr = LOOPBACK4_IPV6;
+
+ __ip6_dst_store(sk, dst, NULL, NULL);
+
+ icsk->icsk_ext_hdr_len = 0;
+ if (np->opt != NULL)
+ icsk->icsk_ext_hdr_len = (np->opt->opt_flen +
+ np->opt->opt_nflen);
+
+ inet->inet_dport = usin->sin6_port;
+
+ dccp_set_state(sk, DCCP_REQUESTING);
+ err = inet6_hash_connect(&dccp_death_row, sk);
+ if (err)
+ goto late_failure;
+
+ dp->dccps_iss = secure_dccpv6_sequence_number(np->saddr.s6_addr32,
+ sk->sk_v6_daddr.s6_addr32,
+ inet->inet_sport,
+ inet->inet_dport);
+ err = dccp_connect(sk);
+ if (err)
+ goto late_failure;
+
+ return 0;
+
+late_failure:
+ dccp_set_state(sk, DCCP_CLOSED);
+ __sk_dst_reset(sk);
+failure:
+ inet->inet_dport = 0;
+ sk->sk_route_caps = 0;
+ return err;
}
-static struct inet_connection_sock_af_ops dccp_ipv6_af_ops = {
- .queue_xmit = inet6_csk_xmit,
- .send_check = dccp_v6_send_check,
- .rebuild_header = inet6_sk_rebuild_header,
- .conn_request = dccp_v6_conn_request,
- .syn_recv_sock = dccp_v6_request_recv_sock,
- .net_header_len = sizeof(struct ipv6hdr),
- .setsockopt = ipv6_setsockopt,
- .getsockopt = ipv6_getsockopt,
- .addr2sockaddr = inet6_csk_addr2sockaddr,
- .sockaddr_len = sizeof(struct sockaddr_in6)
+static const struct inet_connection_sock_af_ops dccp_ipv6_af_ops = {
+ .queue_xmit = inet6_csk_xmit,
+ .send_check = dccp_v6_send_check,
+ .rebuild_header = inet6_sk_rebuild_header,
+ .conn_request = dccp_v6_conn_request,
+ .syn_recv_sock = dccp_v6_request_recv_sock,
+ .net_header_len = sizeof(struct ipv6hdr),
+ .setsockopt = ipv6_setsockopt,
+ .getsockopt = ipv6_getsockopt,
+ .addr2sockaddr = inet6_csk_addr2sockaddr,
+ .sockaddr_len = sizeof(struct sockaddr_in6),
+ .bind_conflict = inet6_csk_bind_conflict,
+#ifdef CONFIG_COMPAT
+ .compat_setsockopt = compat_ipv6_setsockopt,
+ .compat_getsockopt = compat_ipv6_getsockopt,
+#endif
};
/*
* DCCP over IPv4 via INET6 API
*/
-static struct inet_connection_sock_af_ops dccp_ipv6_mapped = {
- .queue_xmit = ip_queue_xmit,
- .send_check = dccp_v4_send_check,
- .rebuild_header = inet_sk_rebuild_header,
- .conn_request = dccp_v6_conn_request,
- .syn_recv_sock = dccp_v6_request_recv_sock,
- .net_header_len = sizeof(struct iphdr),
- .setsockopt = ipv6_setsockopt,
- .getsockopt = ipv6_getsockopt,
- .addr2sockaddr = inet6_csk_addr2sockaddr,
- .sockaddr_len = sizeof(struct sockaddr_in6)
+static const struct inet_connection_sock_af_ops dccp_ipv6_mapped = {
+ .queue_xmit = ip_queue_xmit,
+ .send_check = dccp_v4_send_check,
+ .rebuild_header = inet_sk_rebuild_header,
+ .conn_request = dccp_v6_conn_request,
+ .syn_recv_sock = dccp_v6_request_recv_sock,
+ .net_header_len = sizeof(struct iphdr),
+ .setsockopt = ipv6_setsockopt,
+ .getsockopt = ipv6_getsockopt,
+ .addr2sockaddr = inet6_csk_addr2sockaddr,
+ .sockaddr_len = sizeof(struct sockaddr_in6),
+#ifdef CONFIG_COMPAT
+ .compat_setsockopt = compat_ipv6_setsockopt,
+ .compat_getsockopt = compat_ipv6_getsockopt,
+#endif
};
/* NOTE: A lot of things set to zero explicitly by call to
@@ -1145,71 +1025,89 @@ static struct inet_connection_sock_af_ops dccp_ipv6_mapped = {
*/
static int dccp_v6_init_sock(struct sock *sk)
{
- int err = dccp_v4_init_sock(sk);
+ static __u8 dccp_v6_ctl_sock_initialized;
+ int err = dccp_init_sock(sk, dccp_v6_ctl_sock_initialized);
- if (err == 0)
+ if (err == 0) {
+ if (unlikely(!dccp_v6_ctl_sock_initialized))
+ dccp_v6_ctl_sock_initialized = 1;
inet_csk(sk)->icsk_af_ops = &dccp_ipv6_af_ops;
+ }
return err;
}
-static int dccp_v6_destroy_sock(struct sock *sk)
+static void dccp_v6_destroy_sock(struct sock *sk)
{
- dccp_v4_destroy_sock(sk);
- return inet6_destroy_sock(sk);
+ dccp_destroy_sock(sk);
+ inet6_destroy_sock(sk);
}
+static struct timewait_sock_ops dccp6_timewait_sock_ops = {
+ .twsk_obj_size = sizeof(struct dccp6_timewait_sock),
+};
+
static struct proto dccp_v6_prot = {
- .name = "DCCPv6",
- .owner = THIS_MODULE,
- .close = dccp_close,
- .connect = dccp_v6_connect,
- .disconnect = dccp_disconnect,
- .ioctl = dccp_ioctl,
- .init = dccp_v6_init_sock,
- .setsockopt = dccp_setsockopt,
- .getsockopt = dccp_getsockopt,
- .sendmsg = dccp_sendmsg,
- .recvmsg = dccp_recvmsg,
- .backlog_rcv = dccp_v6_do_rcv,
- .hash = dccp_v6_hash,
- .unhash = dccp_unhash,
- .accept = inet_csk_accept,
- .get_port = dccp_v6_get_port,
- .shutdown = dccp_shutdown,
- .destroy = dccp_v6_destroy_sock,
- .orphan_count = &dccp_orphan_count,
- .max_header = MAX_DCCP_HEADER,
- .obj_size = sizeof(struct dccp6_sock),
- .rsk_prot = &dccp6_request_sock_ops,
- .twsk_prot = &dccp6_timewait_sock_ops,
+ .name = "DCCPv6",
+ .owner = THIS_MODULE,
+ .close = dccp_close,
+ .connect = dccp_v6_connect,
+ .disconnect = dccp_disconnect,
+ .ioctl = dccp_ioctl,
+ .init = dccp_v6_init_sock,
+ .setsockopt = dccp_setsockopt,
+ .getsockopt = dccp_getsockopt,
+ .sendmsg = dccp_sendmsg,
+ .recvmsg = dccp_recvmsg,
+ .backlog_rcv = dccp_v6_do_rcv,
+ .hash = dccp_v6_hash,
+ .unhash = inet_unhash,
+ .accept = inet_csk_accept,
+ .get_port = inet_csk_get_port,
+ .shutdown = dccp_shutdown,
+ .destroy = dccp_v6_destroy_sock,
+ .orphan_count = &dccp_orphan_count,
+ .max_header = MAX_DCCP_HEADER,
+ .obj_size = sizeof(struct dccp6_sock),
+ .slab_flags = SLAB_DESTROY_BY_RCU,
+ .rsk_prot = &dccp6_request_sock_ops,
+ .twsk_prot = &dccp6_timewait_sock_ops,
+ .h.hashinfo = &dccp_hashinfo,
+#ifdef CONFIG_COMPAT
+ .compat_setsockopt = compat_dccp_setsockopt,
+ .compat_getsockopt = compat_dccp_getsockopt,
+#endif
};
-static struct inet6_protocol dccp_v6_protocol = {
- .handler = dccp_v6_rcv,
- .err_handler = dccp_v6_err,
- .flags = INET6_PROTO_NOPOLICY | INET6_PROTO_FINAL,
+static const struct inet6_protocol dccp_v6_protocol = {
+ .handler = dccp_v6_rcv,
+ .err_handler = dccp_v6_err,
+ .flags = INET6_PROTO_NOPOLICY | INET6_PROTO_FINAL,
};
-static struct proto_ops inet6_dccp_ops = {
- .family = PF_INET6,
- .owner = THIS_MODULE,
- .release = inet6_release,
- .bind = inet6_bind,
- .connect = inet_stream_connect,
- .socketpair = sock_no_socketpair,
- .accept = inet_accept,
- .getname = inet6_getname,
- .poll = dccp_poll,
- .ioctl = inet6_ioctl,
- .listen = inet_dccp_listen,
- .shutdown = inet_shutdown,
- .setsockopt = sock_common_setsockopt,
- .getsockopt = sock_common_getsockopt,
- .sendmsg = inet_sendmsg,
- .recvmsg = sock_common_recvmsg,
- .mmap = sock_no_mmap,
- .sendpage = sock_no_sendpage,
+static const struct proto_ops inet6_dccp_ops = {
+ .family = PF_INET6,
+ .owner = THIS_MODULE,
+ .release = inet6_release,
+ .bind = inet6_bind,
+ .connect = inet_stream_connect,
+ .socketpair = sock_no_socketpair,
+ .accept = inet_accept,
+ .getname = inet6_getname,
+ .poll = dccp_poll,
+ .ioctl = inet6_ioctl,
+ .listen = inet_dccp_listen,
+ .shutdown = inet_shutdown,
+ .setsockopt = sock_common_setsockopt,
+ .getsockopt = sock_common_getsockopt,
+ .sendmsg = inet_sendmsg,
+ .recvmsg = sock_common_recvmsg,
+ .mmap = sock_no_mmap,
+ .sendpage = sock_no_sendpage,
+#ifdef CONFIG_COMPAT
+ .compat_setsockopt = compat_sock_common_setsockopt,
+ .compat_getsockopt = compat_sock_common_getsockopt,
+#endif
};
static struct inet_protosw dccp_v6_protosw = {
@@ -1217,10 +1115,28 @@ static struct inet_protosw dccp_v6_protosw = {
.protocol = IPPROTO_DCCP,
.prot = &dccp_v6_prot,
.ops = &inet6_dccp_ops,
- .capability = -1,
.flags = INET_PROTOSW_ICSK,
};
+static int __net_init dccp_v6_init_net(struct net *net)
+{
+ if (dccp_hashinfo.bhash == NULL)
+ return -ESOCKTNOSUPPORT;
+
+ return inet_ctl_sock_create(&net->dccp.v6_ctl_sk, PF_INET6,
+ SOCK_DCCP, IPPROTO_DCCP, net);
+}
+
+static void __net_exit dccp_v6_exit_net(struct net *net)
+{
+ inet_ctl_sock_destroy(net->dccp.v6_ctl_sk);
+}
+
+static struct pernet_operations dccp_v6_ops = {
+ .init = dccp_v6_init_net,
+ .exit = dccp_v6_exit_net,
+};
+
static int __init dccp_v6_init(void)
{
int err = proto_register(&dccp_v6_prot, 1);
@@ -1233,8 +1149,16 @@ static int __init dccp_v6_init(void)
goto out_unregister_proto;
inet6_register_protosw(&dccp_v6_protosw);
+
+ err = register_pernet_subsys(&dccp_v6_ops);
+ if (err != 0)
+ goto out_destroy_ctl_sock;
out:
return err;
+
+out_destroy_ctl_sock:
+ inet6_del_protocol(&dccp_v6_protocol, IPPROTO_DCCP);
+ inet6_unregister_protosw(&dccp_v6_protosw);
out_unregister_proto:
proto_unregister(&dccp_v6_prot);
goto out;
@@ -1242,6 +1166,7 @@ out_unregister_proto:
static void __exit dccp_v6_exit(void)
{
+ unregister_pernet_subsys(&dccp_v6_ops);
inet6_del_protocol(&dccp_v6_protocol, IPPROTO_DCCP);
inet6_unregister_protosw(&dccp_v6_protosw);
proto_unregister(&dccp_v6_prot);
@@ -1255,8 +1180,8 @@ module_exit(dccp_v6_exit);
* values directly, Also cover the case where the protocol is not specified,
* i.e. net-pf-PF_INET6-proto-0-type-SOCK_DCCP
*/
-MODULE_ALIAS("net-pf-" __stringify(PF_INET6) "-proto-33-type-6");
-MODULE_ALIAS("net-pf-" __stringify(PF_INET6) "-proto-0-type-6");
+MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_INET6, 33, 6);
+MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_INET6, 0, 6);
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Arnaldo Carvalho de Melo <acme@mandriva.com>");
MODULE_DESCRIPTION("DCCPv6 - Datagram Congestion Controlled Protocol");
diff --git a/net/dccp/ipv6.h b/net/dccp/ipv6.h
index e4d4e930927..af259e15e7f 100644
--- a/net/dccp/ipv6.h
+++ b/net/dccp/ipv6.h
@@ -11,7 +11,6 @@
* published by the Free Software Foundation.
*/
-#include <linux/config.h>
#include <linux/dccp.h>
#include <linux/ipv6.h>
@@ -26,12 +25,10 @@ struct dccp6_sock {
struct dccp6_request_sock {
struct dccp_request_sock dccp;
- struct inet6_request_sock inet6;
};
struct dccp6_timewait_sock {
struct inet_timewait_sock inet;
- struct inet6_timewait_sock tw6;
};
#endif /* _DCCP_IPV6_H */
diff --git a/net/dccp/minisocks.c b/net/dccp/minisocks.c
index 29261fc198e..c69eb9c4fbb 100644
--- a/net/dccp/minisocks.c
+++ b/net/dccp/minisocks.c
@@ -10,8 +10,9 @@
* 2 of the License, or (at your option) any later version.
*/
-#include <linux/config.h>
#include <linux/dccp.h>
+#include <linux/gfp.h>
+#include <linux/kernel.h>
#include <linux/skbuff.h>
#include <linux/timer.h>
@@ -22,17 +23,17 @@
#include "ackvec.h"
#include "ccid.h"
#include "dccp.h"
+#include "feat.h"
struct inet_timewait_death_row dccp_death_row = {
.sysctl_max_tw_buckets = NR_FILE * 2,
.period = DCCP_TIMEWAIT_LEN / INET_TWDR_TWKILL_SLOTS,
- .death_lock = SPIN_LOCK_UNLOCKED,
+ .death_lock = __SPIN_LOCK_UNLOCKED(dccp_death_row.death_lock),
.hashinfo = &dccp_hashinfo,
.tw_timer = TIMER_INITIALIZER(inet_twdr_hangman, 0,
(unsigned long)&dccp_death_row),
.twkill_work = __WORK_INITIALIZER(dccp_death_row.twkill_work,
- inet_twdr_twkill_work,
- &dccp_death_row),
+ inet_twdr_twkill_work),
/* Short-time timewait calendar */
.twcal_hand = -1,
@@ -52,15 +53,12 @@ void dccp_time_wait(struct sock *sk, int state, int timeo)
if (tw != NULL) {
const struct inet_connection_sock *icsk = inet_csk(sk);
const int rto = (icsk->icsk_rto << 2) - (icsk->icsk_rto >> 1);
-#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+#if IS_ENABLED(CONFIG_IPV6)
if (tw->tw_family == PF_INET6) {
const struct ipv6_pinfo *np = inet6_sk(sk);
- struct inet6_timewait_sock *tw6;
- tw->tw_ipv6_offset = inet6_tw_offset(sk->sk_prot);
- tw6 = inet6_twsk((struct sock *)tw);
- ipv6_addr_copy(&tw6->tw_v6_daddr, &np->daddr);
- ipv6_addr_copy(&tw6->tw_v6_rcv_saddr, &np->rcv_saddr);
+ tw->tw_v6_daddr = sk->sk_v6_daddr;
+ tw->tw_v6_rcv_saddr = sk->sk_v6_rcv_saddr;
tw->tw_ipv6only = np->ipv6only;
}
#endif
@@ -83,8 +81,7 @@ void dccp_time_wait(struct sock *sk, int state, int timeo)
* socket up. We've got bigger problems than
* non-graceful socket closings.
*/
- LIMIT_NETDEBUG(KERN_INFO "DCCP: time wait bucket "
- "table overflow\n");
+ DCCP_WARN("time wait bucket table overflow\n");
}
dccp_done(sk);
@@ -97,85 +94,52 @@ struct sock *dccp_create_openreq_child(struct sock *sk,
/*
* Step 3: Process LISTEN state
*
- * // Generate a new socket and switch to that socket
- * Set S := new socket for this port pair
+ * (* Generate a new socket and switch to that socket *)
+ * Set S := new socket for this port pair
*/
- struct sock *newsk = inet_csk_clone(sk, req, GFP_ATOMIC);
+ struct sock *newsk = inet_csk_clone_lock(sk, req, GFP_ATOMIC);
if (newsk != NULL) {
- const struct dccp_request_sock *dreq = dccp_rsk(req);
- struct inet_connection_sock *newicsk = inet_csk(sk);
+ struct dccp_request_sock *dreq = dccp_rsk(req);
+ struct inet_connection_sock *newicsk = inet_csk(newsk);
struct dccp_sock *newdp = dccp_sk(newsk);
- newdp->dccps_role = DCCP_ROLE_SERVER;
- newdp->dccps_hc_rx_ackvec = NULL;
- newdp->dccps_service_list = NULL;
- newdp->dccps_service = dreq->dreq_service;
- newicsk->icsk_rto = DCCP_TIMEOUT_INIT;
- do_gettimeofday(&newdp->dccps_epoch);
-
- if (newdp->dccps_options.dccpo_send_ack_vector) {
- newdp->dccps_hc_rx_ackvec =
- dccp_ackvec_alloc(DCCP_MAX_ACKVEC_LEN,
- GFP_ATOMIC);
- /*
- * XXX: We're using the same CCIDs set on the parent,
- * i.e. sk_clone copied the master sock and left the
- * CCID pointers for this child, that is why we do the
- * __ccid_get calls.
- */
- if (unlikely(newdp->dccps_hc_rx_ackvec == NULL))
- goto out_free;
- }
-
- if (unlikely(ccid_hc_rx_init(newdp->dccps_hc_rx_ccid,
- newsk) != 0 ||
- ccid_hc_tx_init(newdp->dccps_hc_tx_ccid,
- newsk) != 0)) {
- dccp_ackvec_free(newdp->dccps_hc_rx_ackvec);
- ccid_hc_rx_exit(newdp->dccps_hc_rx_ccid, newsk);
- ccid_hc_tx_exit(newdp->dccps_hc_tx_ccid, newsk);
-out_free:
- /* It is still raw copy of parent, so invalidate
- * destructor and make plain sk_free() */
- newsk->sk_destruct = NULL;
- sk_free(newsk);
- return NULL;
- }
-
- __ccid_get(newdp->dccps_hc_rx_ccid);
- __ccid_get(newdp->dccps_hc_tx_ccid);
+ newdp->dccps_role = DCCP_ROLE_SERVER;
+ newdp->dccps_hc_rx_ackvec = NULL;
+ newdp->dccps_service_list = NULL;
+ newdp->dccps_service = dreq->dreq_service;
+ newdp->dccps_timestamp_echo = dreq->dreq_timestamp_echo;
+ newdp->dccps_timestamp_time = dreq->dreq_timestamp_time;
+ newicsk->icsk_rto = DCCP_TIMEOUT_INIT;
+ INIT_LIST_HEAD(&newdp->dccps_featneg);
/*
* Step 3: Process LISTEN state
*
- * Choose S.ISS (initial seqno) or set from Init Cookie
- * Set S.ISR, S.GSR, S.SWL, S.SWH from packet or Init
- * Cookie
+ * Choose S.ISS (initial seqno) or set from Init Cookies
+ * Initialize S.GAR := S.ISS
+ * Set S.ISR, S.GSR from packet (or Init Cookies)
+ *
+ * Setting AWL/AWH and SWL/SWH happens as part of the feature
+ * activation below, as these windows all depend on the local
+ * and remote Sequence Window feature values (7.5.2).
*/
-
- /* See dccp_v4_conn_request */
- newdp->dccps_options.dccpo_sequence_window = req->rcv_wnd;
-
- newdp->dccps_gar = newdp->dccps_isr = dreq->dreq_isr;
- dccp_update_gsr(newsk, dreq->dreq_isr);
-
newdp->dccps_iss = dreq->dreq_iss;
- dccp_update_gss(newsk, dreq->dreq_iss);
+ newdp->dccps_gss = dreq->dreq_gss;
+ newdp->dccps_gar = newdp->dccps_iss;
+ newdp->dccps_isr = dreq->dreq_isr;
+ newdp->dccps_gsr = dreq->dreq_gsr;
/*
- * SWL and AWL are initially adjusted so that they are not less than
- * the initial Sequence Numbers received and sent, respectively:
- * SWL := max(GSR + 1 - floor(W/4), ISR),
- * AWL := max(GSS - W' + 1, ISS).
- * These adjustments MUST be applied only at the beginning of the
- * connection.
+ * Activate features: initialise CCIDs, sequence windows etc.
*/
- dccp_set_seqno(&newdp->dccps_swl,
- max48(newdp->dccps_swl, newdp->dccps_isr));
- dccp_set_seqno(&newdp->dccps_awl,
- max48(newdp->dccps_awl, newdp->dccps_iss));
-
+ if (dccp_feat_activate_values(newsk, &dreq->dreq_featneg)) {
+ /* It is still a raw copy of the parent, so invalidate
+ * the destructor and do a plain sk_free() */
+ newsk->sk_destruct = NULL;
+ sk_free(newsk);
+ return NULL;
+ }
dccp_init_xmit_timers(newsk);
DCCP_INC_STATS_BH(DCCP_MIB_PASSIVEOPENS);
@@ -185,7 +149,7 @@ out_free:
EXPORT_SYMBOL_GPL(dccp_create_openreq_child);
-/*
+/*
* Process an incoming packet for RESPOND sockets represented
* as an request_sock.
*/
@@ -194,19 +158,20 @@ struct sock *dccp_check_req(struct sock *sk, struct sk_buff *skb,
struct request_sock **prev)
{
struct sock *child = NULL;
+ struct dccp_request_sock *dreq = dccp_rsk(req);
/* Check for retransmitted REQUEST */
if (dccp_hdr(skb)->dccph_type == DCCP_PKT_REQUEST) {
- if (after48(DCCP_SKB_CB(skb)->dccpd_seq,
- dccp_rsk(req)->dreq_isr)) {
- struct dccp_request_sock *dreq = dccp_rsk(req);
+ if (after48(DCCP_SKB_CB(skb)->dccpd_seq, dreq->dreq_gsr)) {
dccp_pr_debug("Retransmitted REQUEST\n");
- /* Send another RESPONSE packet */
- dccp_set_seqno(&dreq->dreq_iss, dreq->dreq_iss + 1);
- dccp_set_seqno(&dreq->dreq_isr,
- DCCP_SKB_CB(skb)->dccpd_seq);
- req->rsk_ops->rtx_syn_ack(sk, req, NULL);
+ dreq->dreq_gsr = DCCP_SKB_CB(skb)->dccpd_seq;
+ /*
+ * Send another RESPONSE packet
+ * To protect against Request floods, increment retrans
+ * counter (backoff, monitored by dccp_response_timer).
+ */
+ inet_rtx_syn_ack(sk, req);
}
/* Network Duplicate, discard packet */
return NULL;
@@ -219,22 +184,24 @@ struct sock *dccp_check_req(struct sock *sk, struct sk_buff *skb,
goto drop;
/* Invalid ACK */
- if (DCCP_SKB_CB(skb)->dccpd_ack_seq != dccp_rsk(req)->dreq_iss) {
+ if (!between48(DCCP_SKB_CB(skb)->dccpd_ack_seq,
+ dreq->dreq_iss, dreq->dreq_gss)) {
dccp_pr_debug("Invalid ACK number: ack_seq=%llu, "
- "dreq_iss=%llu\n",
+ "dreq_iss=%llu, dreq_gss=%llu\n",
(unsigned long long)
DCCP_SKB_CB(skb)->dccpd_ack_seq,
- (unsigned long long)
- dccp_rsk(req)->dreq_iss);
+ (unsigned long long) dreq->dreq_iss,
+ (unsigned long long) dreq->dreq_gss);
goto drop;
}
+ if (dccp_parse_options(sk, dreq, skb))
+ goto drop;
+
child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL);
if (child == NULL)
goto listen_overflow;
- /* FIXME: deal with options */
-
inet_csk_reqsk_queue_unlink(sk, req, prev);
inet_csk_reqsk_queue_removed(sk, req);
inet_csk_reqsk_queue_add(sk, req, child);
@@ -245,7 +212,7 @@ listen_overflow:
DCCP_SKB_CB(skb)->dccpd_reset_code = DCCP_RESET_CODE_TOO_BUSY;
drop:
if (dccp_hdr(skb)->dccph_type != DCCP_PKT_RESET)
- req->rsk_ops->send_reset(skb);
+ req->rsk_ops->send_reset(sk, skb);
inet_csk_reqsk_queue_drop(sk, req, prev);
goto out;
@@ -270,13 +237,13 @@ int dccp_child_process(struct sock *parent, struct sock *child,
/* Wakeup parent, send SIGIO */
if (state == DCCP_RESPOND && child->sk_state != state)
- parent->sk_data_ready(parent, 0);
+ parent->sk_data_ready(parent);
} else {
/* Alas, it is possible again, because we do lookup
* in main socket hash table and lock on listening
* socket does not protect us more.
*/
- sk_add_backlog(child, skb);
+ __sk_add_backlog(child, skb);
}
bh_unlock_sock(child);
@@ -285,3 +252,27 @@ int dccp_child_process(struct sock *parent, struct sock *child,
}
EXPORT_SYMBOL_GPL(dccp_child_process);
+
+void dccp_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
+ struct request_sock *rsk)
+{
+ DCCP_BUG("DCCP-ACK packets are never sent in LISTEN/RESPOND state");
+}
+
+EXPORT_SYMBOL_GPL(dccp_reqsk_send_ack);
+
+int dccp_reqsk_init(struct request_sock *req,
+ struct dccp_sock const *dp, struct sk_buff const *skb)
+{
+ struct dccp_request_sock *dreq = dccp_rsk(req);
+
+ inet_rsk(req)->ir_rmt_port = dccp_hdr(skb)->dccph_sport;
+ inet_rsk(req)->ir_num = ntohs(dccp_hdr(skb)->dccph_dport);
+ inet_rsk(req)->acked = 0;
+ dreq->dreq_timestamp_echo = 0;
+
+ /* inherit feature negotiation options from listening socket */
+ return dccp_feat_clone_list(&dp->dccps_featneg, &dreq->dreq_featneg);
+}
+
+EXPORT_SYMBOL_GPL(dccp_reqsk_init);
diff --git a/net/dccp/options.c b/net/dccp/options.c
index 0a76426c9ae..9bce31886bd 100644
--- a/net/dccp/options.c
+++ b/net/dccp/options.c
@@ -4,61 +4,54 @@
* An implementation of the DCCP protocol
* Copyright (c) 2005 Aristeu Sergio Rozanski Filho <aris@cathedrallabs.org>
* Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@ghostprotocols.net>
- * Copyright (c) 2005 Ian McDonald <iam4@cs.waikato.ac.nz>
+ * Copyright (c) 2005 Ian McDonald <ian.mcdonald@jandi.co.nz>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*/
-#include <linux/config.h>
#include <linux/dccp.h>
#include <linux/module.h>
#include <linux/types.h>
+#include <asm/unaligned.h>
#include <linux/kernel.h>
#include <linux/skbuff.h>
#include "ackvec.h"
#include "ccid.h"
#include "dccp.h"
+#include "feat.h"
-/* stores the default values for new connection. may be changed with sysctl */
-static const struct dccp_options dccpo_default_values = {
- .dccpo_sequence_window = DCCPF_INITIAL_SEQUENCE_WINDOW,
- .dccpo_rx_ccid = DCCPF_INITIAL_CCID,
- .dccpo_tx_ccid = DCCPF_INITIAL_CCID,
- .dccpo_send_ack_vector = DCCPF_INITIAL_SEND_ACK_VECTOR,
- .dccpo_send_ndp_count = DCCPF_INITIAL_SEND_NDP_COUNT,
-};
-
-void dccp_options_init(struct dccp_options *dccpo)
-{
- memcpy(dccpo, &dccpo_default_values, sizeof(*dccpo));
-}
-
-static u32 dccp_decode_value_var(const unsigned char *bf, const u8 len)
+u64 dccp_decode_value_var(const u8 *bf, const u8 len)
{
- u32 value = 0;
+ u64 value = 0;
+ if (len >= DCCP_OPTVAL_MAXLEN)
+ value += ((u64)*bf++) << 40;
+ if (len > 4)
+ value += ((u64)*bf++) << 32;
if (len > 3)
- value += *bf++ << 24;
+ value += ((u64)*bf++) << 24;
if (len > 2)
- value += *bf++ << 16;
+ value += ((u64)*bf++) << 16;
if (len > 1)
- value += *bf++ << 8;
+ value += ((u64)*bf++) << 8;
if (len > 0)
value += *bf;
return value;
}
-int dccp_parse_options(struct sock *sk, struct sk_buff *skb)
+/**
+ * dccp_parse_options - Parse DCCP options present in @skb
+ * @sk: client|server|listening dccp socket (when @dreq != NULL)
+ * @dreq: request socket to use during connection setup, or NULL
+ */
+int dccp_parse_options(struct sock *sk, struct dccp_request_sock *dreq,
+ struct sk_buff *skb)
{
struct dccp_sock *dp = dccp_sk(sk);
-#ifdef CONFIG_IP_DCCP_DEBUG
- const char *debug_prefix = dp->dccps_role == DCCP_ROLE_CLIENT ?
- "CLIENT rx opt: " : "server rx opt: ";
-#endif
const struct dccp_hdr *dh = dccp_hdr(skb);
const u8 pkt_type = DCCP_SKB_CB(skb)->dccpd_type;
unsigned char *options = (unsigned char *)dh + dccp_hdr_len(skb);
@@ -67,11 +60,15 @@ int dccp_parse_options(struct sock *sk, struct sk_buff *skb)
(dh->dccph_doff * 4);
struct dccp_options_received *opt_recv = &dp->dccps_options_received;
unsigned char opt, len;
- unsigned char *value;
+ unsigned char *uninitialized_var(value);
u32 elapsed_time;
+ __be32 opt_val;
+ int rc;
+ int mandatory = 0;
memset(opt_recv, 0, sizeof(*opt_recv));
+ opt = len = 0;
while (opt_ptr != opt_end) {
opt = *opt_ptr++;
len = 0;
@@ -80,11 +77,11 @@ int dccp_parse_options(struct sock *sk, struct sk_buff *skb)
/* Check if this isn't a single byte option */
if (opt > DCCPO_MAX_RESERVED) {
if (opt_ptr == opt_end)
- goto out_invalid_option;
+ goto out_nonsensical_length;
len = *opt_ptr++;
- if (len < 3)
- goto out_invalid_option;
+ if (len < 2)
+ goto out_nonsensical_length;
/*
* Remove the type and len fields, leaving
* just the value size
@@ -94,134 +91,187 @@ int dccp_parse_options(struct sock *sk, struct sk_buff *skb)
opt_ptr += len;
if (opt_ptr > opt_end)
- goto out_invalid_option;
+ goto out_nonsensical_length;
}
+ /*
+ * CCID-specific options are ignored during connection setup, as
+ * negotiation may still be in progress (see RFC 4340, 10.3).
+ * The same applies to Ack Vectors, as these depend on the CCID.
+ */
+ if (dreq != NULL && (opt >= DCCPO_MIN_RX_CCID_SPECIFIC ||
+ opt == DCCPO_ACK_VECTOR_0 || opt == DCCPO_ACK_VECTOR_1))
+ goto ignore_option;
+
switch (opt) {
case DCCPO_PADDING:
break;
+ case DCCPO_MANDATORY:
+ if (mandatory)
+ goto out_invalid_option;
+ if (pkt_type != DCCP_PKT_DATA)
+ mandatory = 1;
+ break;
case DCCPO_NDP_COUNT:
- if (len > 3)
+ if (len > 6)
goto out_invalid_option;
opt_recv->dccpor_ndp = dccp_decode_value_var(value, len);
- dccp_pr_debug("%sNDP count=%d\n", debug_prefix,
- opt_recv->dccpor_ndp);
+ dccp_pr_debug("%s opt: NDP count=%llu\n", dccp_role(sk),
+ (unsigned long long)opt_recv->dccpor_ndp);
break;
- case DCCPO_ACK_VECTOR_0:
- case DCCPO_ACK_VECTOR_1:
- if (pkt_type == DCCP_PKT_DATA)
- continue;
-
- if (dp->dccps_options.dccpo_send_ack_vector &&
- dccp_ackvec_parse(sk, skb, opt, value, len))
+ case DCCPO_CHANGE_L ... DCCPO_CONFIRM_R:
+ if (pkt_type == DCCP_PKT_DATA) /* RFC 4340, 6 */
+ break;
+ if (len == 0)
goto out_invalid_option;
+ rc = dccp_feat_parse_options(sk, dreq, mandatory, opt,
+ *value, value + 1, len - 1);
+ if (rc)
+ goto out_featneg_failed;
break;
case DCCPO_TIMESTAMP:
if (len != 4)
goto out_invalid_option;
-
- opt_recv->dccpor_timestamp = ntohl(*(u32 *)value);
-
- dp->dccps_timestamp_echo = opt_recv->dccpor_timestamp;
- dccp_timestamp(sk, &dp->dccps_timestamp_time);
-
- dccp_pr_debug("%sTIMESTAMP=%u, ackno=%llu\n",
- debug_prefix, opt_recv->dccpor_timestamp,
+ /*
+ * RFC 4340 13.1: "The precise time corresponding to
+ * Timestamp Value zero is not specified". We use
+ * zero to indicate absence of a meaningful timestamp.
+ */
+ opt_val = get_unaligned((__be32 *)value);
+ if (unlikely(opt_val == 0)) {
+ DCCP_WARN("Timestamp with zero value\n");
+ break;
+ }
+
+ if (dreq != NULL) {
+ dreq->dreq_timestamp_echo = ntohl(opt_val);
+ dreq->dreq_timestamp_time = dccp_timestamp();
+ } else {
+ opt_recv->dccpor_timestamp =
+ dp->dccps_timestamp_echo = ntohl(opt_val);
+ dp->dccps_timestamp_time = dccp_timestamp();
+ }
+ dccp_pr_debug("%s rx opt: TIMESTAMP=%u, ackno=%llu\n",
+ dccp_role(sk), ntohl(opt_val),
(unsigned long long)
DCCP_SKB_CB(skb)->dccpd_ack_seq);
+ /* schedule an Ack in case this sender is quiescent */
+ inet_csk_schedule_ack(sk);
break;
case DCCPO_TIMESTAMP_ECHO:
if (len != 4 && len != 6 && len != 8)
goto out_invalid_option;
- opt_recv->dccpor_timestamp_echo = ntohl(*(u32 *)value);
+ opt_val = get_unaligned((__be32 *)value);
+ opt_recv->dccpor_timestamp_echo = ntohl(opt_val);
- dccp_pr_debug("%sTIMESTAMP_ECHO=%u, len=%d, ackno=%llu, ",
- debug_prefix,
+ dccp_pr_debug("%s rx opt: TIMESTAMP_ECHO=%u, len=%d, "
+ "ackno=%llu", dccp_role(sk),
opt_recv->dccpor_timestamp_echo,
len + 2,
(unsigned long long)
DCCP_SKB_CB(skb)->dccpd_ack_seq);
+ value += 4;
- if (len == 4)
+ if (len == 4) { /* no elapsed time included */
+ dccp_pr_debug_cat("\n");
break;
+ }
+
+ if (len == 6) { /* 2-byte elapsed time */
+ __be16 opt_val2 = get_unaligned((__be16 *)value);
+ elapsed_time = ntohs(opt_val2);
+ } else { /* 4-byte elapsed time */
+ opt_val = get_unaligned((__be32 *)value);
+ elapsed_time = ntohl(opt_val);
+ }
- if (len == 6)
- elapsed_time = ntohs(*(u16 *)(value + 4));
- else
- elapsed_time = ntohl(*(u32 *)(value + 4));
+ dccp_pr_debug_cat(", ELAPSED_TIME=%u\n", elapsed_time);
/* Give precedence to the biggest ELAPSED_TIME */
if (elapsed_time > opt_recv->dccpor_elapsed_time)
opt_recv->dccpor_elapsed_time = elapsed_time;
break;
case DCCPO_ELAPSED_TIME:
- if (len != 2 && len != 4)
- goto out_invalid_option;
-
- if (pkt_type == DCCP_PKT_DATA)
- continue;
+ if (dccp_packet_without_ack(skb)) /* RFC 4340, 13.2 */
+ break;
- if (len == 2)
- elapsed_time = ntohs(*(u16 *)value);
- else
- elapsed_time = ntohl(*(u32 *)value);
+ if (len == 2) {
+ __be16 opt_val2 = get_unaligned((__be16 *)value);
+ elapsed_time = ntohs(opt_val2);
+ } else if (len == 4) {
+ opt_val = get_unaligned((__be32 *)value);
+ elapsed_time = ntohl(opt_val);
+ } else {
+ goto out_invalid_option;
+ }
if (elapsed_time > opt_recv->dccpor_elapsed_time)
opt_recv->dccpor_elapsed_time = elapsed_time;
- dccp_pr_debug("%sELAPSED_TIME=%d\n", debug_prefix,
- elapsed_time);
+ dccp_pr_debug("%s rx opt: ELAPSED_TIME=%d\n",
+ dccp_role(sk), elapsed_time);
break;
- /*
- * From draft-ietf-dccp-spec-11.txt:
- *
- * Option numbers 128 through 191 are for
- * options sent from the HC-Sender to the
- * HC-Receiver; option numbers 192 through 255
- * are for options sent from the HC-Receiver to
- * the HC-Sender.
- */
- case 128 ... 191: {
- const u16 idx = value - options;
-
+ case DCCPO_MIN_RX_CCID_SPECIFIC ... DCCPO_MAX_RX_CCID_SPECIFIC:
if (ccid_hc_rx_parse_options(dp->dccps_hc_rx_ccid, sk,
- opt, len, idx,
- value) != 0)
+ pkt_type, opt, value, len))
goto out_invalid_option;
- }
break;
- case 192 ... 255: {
- const u16 idx = value - options;
-
+ case DCCPO_ACK_VECTOR_0:
+ case DCCPO_ACK_VECTOR_1:
+ if (dccp_packet_without_ack(skb)) /* RFC 4340, 11.4 */
+ break;
+ /*
+ * Ack vectors are processed by the TX CCID if it is
+ * interested. The RX CCID need not parse Ack Vectors,
+ * since it is only interested in clearing old state.
+ * Fall through.
+ */
+ case DCCPO_MIN_TX_CCID_SPECIFIC ... DCCPO_MAX_TX_CCID_SPECIFIC:
if (ccid_hc_tx_parse_options(dp->dccps_hc_tx_ccid, sk,
- opt, len, idx,
- value) != 0)
+ pkt_type, opt, value, len))
goto out_invalid_option;
- }
break;
default:
- pr_info("DCCP(%p): option %d(len=%d) not "
- "implemented, ignoring\n",
- sk, opt, len);
+ DCCP_CRIT("DCCP(%p): option %d(len=%d) not "
+ "implemented, ignoring", sk, opt, len);
break;
- }
+ }
+ignore_option:
+ if (opt != DCCPO_MANDATORY)
+ mandatory = 0;
}
+ /* mandatory was the last byte in option list -> reset connection */
+ if (mandatory)
+ goto out_invalid_option;
+
+out_nonsensical_length:
+ /* RFC 4340, 5.8: ignore option and all remaining option space */
return 0;
out_invalid_option:
DCCP_INC_STATS_BH(DCCP_MIB_INVALIDOPT);
- DCCP_SKB_CB(skb)->dccpd_reset_code = DCCP_RESET_CODE_OPTION_ERROR;
- pr_info("DCCP(%p): invalid option %d, len=%d\n", sk, opt, len);
+ rc = DCCP_RESET_CODE_OPTION_ERROR;
+out_featneg_failed:
+ DCCP_WARN("DCCP(%p): Option %d (len=%d) error=%u\n", sk, opt, len, rc);
+ DCCP_SKB_CB(skb)->dccpd_reset_code = rc;
+ DCCP_SKB_CB(skb)->dccpd_reset_data[0] = opt;
+ DCCP_SKB_CB(skb)->dccpd_reset_data[1] = len > 0 ? value[0] : 0;
+ DCCP_SKB_CB(skb)->dccpd_reset_data[2] = len > 1 ? value[1] : 0;
return -1;
}
-static void dccp_encode_value_var(const u32 value, unsigned char *to,
- const unsigned int len)
+EXPORT_SYMBOL_GPL(dccp_parse_options);
+
+void dccp_encode_value_var(const u64 value, u8 *to, const u8 len)
{
+ if (len >= DCCP_OPTVAL_MAXLEN)
+ *to++ = (value & 0xFF0000000000ull) >> 40;
+ if (len > 4)
+ *to++ = (value & 0xFF00000000ull) >> 32;
if (len > 3)
*to++ = (value & 0xFF000000) >> 24;
if (len > 2)
@@ -232,22 +282,20 @@ static void dccp_encode_value_var(const u32 value, unsigned char *to,
*to++ = (value & 0xFF);
}
-static inline int dccp_ndp_len(const int ndp)
+static inline u8 dccp_ndp_len(const u64 ndp)
{
- return likely(ndp <= 0xFF) ? 1 : ndp <= 0xFFFF ? 2 : 3;
+ if (likely(ndp <= 0xFF))
+ return 1;
+ return likely(ndp <= USHRT_MAX) ? 2 : (ndp <= UINT_MAX ? 4 : 6);
}
-void dccp_insert_option(struct sock *sk, struct sk_buff *skb,
- const unsigned char option,
- const void *value, const unsigned char len)
+int dccp_insert_option(struct sk_buff *skb, const unsigned char option,
+ const void *value, const unsigned char len)
{
unsigned char *to;
- if (DCCP_SKB_CB(skb)->dccpd_opt_len + len + 2 > DCCP_MAX_OPT_LEN) {
- LIMIT_NETDEBUG(KERN_INFO "DCCP: packet too small to insert "
- "%d option!\n", option);
- return;
- }
+ if (DCCP_SKB_CB(skb)->dccpd_opt_len + len + 2 > DCCP_MAX_OPT_LEN)
+ return -1;
DCCP_SKB_CB(skb)->dccpd_opt_len += len + 2;
@@ -256,14 +304,15 @@ void dccp_insert_option(struct sock *sk, struct sk_buff *skb,
*to++ = len + 2;
memcpy(to, value, len);
+ return 0;
}
EXPORT_SYMBOL_GPL(dccp_insert_option);
-static void dccp_insert_option_ndp(struct sock *sk, struct sk_buff *skb)
+static int dccp_insert_option_ndp(struct sock *sk, struct sk_buff *skb)
{
struct dccp_sock *dp = dccp_sk(sk);
- int ndp = dp->dccps_ndp_count;
+ u64 ndp = dp->dccps_ndp_count;
if (dccp_non_data_packet(skb))
++dp->dccps_ndp_count;
@@ -276,7 +325,7 @@ static void dccp_insert_option_ndp(struct sock *sk, struct sk_buff *skb)
const int len = ndp_len + 2;
if (DCCP_SKB_CB(skb)->dccpd_opt_len + len > DCCP_MAX_OPT_LEN)
- return;
+ return -1;
DCCP_SKB_CB(skb)->dccpd_opt_len += len;
@@ -285,6 +334,8 @@ static void dccp_insert_option_ndp(struct sock *sk, struct sk_buff *skb)
*ptr++ = len;
dccp_encode_value_var(ndp, ptr, ndp_len);
}
+
+ return 0;
}
static inline int dccp_elapsed_time_len(const u32 elapsed_time)
@@ -292,171 +343,267 @@ static inline int dccp_elapsed_time_len(const u32 elapsed_time)
return elapsed_time == 0 ? 0 : elapsed_time <= 0xFFFF ? 2 : 4;
}
-void dccp_insert_option_elapsed_time(struct sock *sk,
- struct sk_buff *skb,
- u32 elapsed_time)
+static int dccp_insert_option_timestamp(struct sk_buff *skb)
{
-#ifdef CONFIG_IP_DCCP_DEBUG
- struct dccp_sock *dp = dccp_sk(sk);
- const char *debug_prefix = dp->dccps_role == DCCP_ROLE_CLIENT ?
- "CLIENT TX opt: " : "server TX opt: ";
-#endif
- const int elapsed_time_len = dccp_elapsed_time_len(elapsed_time);
- const int len = 2 + elapsed_time_len;
- unsigned char *to;
+ __be32 now = htonl(dccp_timestamp());
+ /* yes this will overflow but that is the point as we want a
+ * 10 usec 32 bit timer which means it wraps every 11.9 hours */
- if (elapsed_time_len == 0)
- return;
+ return dccp_insert_option(skb, DCCPO_TIMESTAMP, &now, sizeof(now));
+}
+
+static int dccp_insert_option_timestamp_echo(struct dccp_sock *dp,
+ struct dccp_request_sock *dreq,
+ struct sk_buff *skb)
+{
+ __be32 tstamp_echo;
+ unsigned char *to;
+ u32 elapsed_time, elapsed_time_len, len;
- if (DCCP_SKB_CB(skb)->dccpd_opt_len + len > DCCP_MAX_OPT_LEN) {
- LIMIT_NETDEBUG(KERN_INFO "DCCP: packet too small to "
- "insert elapsed time!\n");
- return;
+ if (dreq != NULL) {
+ elapsed_time = dccp_timestamp() - dreq->dreq_timestamp_time;
+ tstamp_echo = htonl(dreq->dreq_timestamp_echo);
+ dreq->dreq_timestamp_echo = 0;
+ } else {
+ elapsed_time = dccp_timestamp() - dp->dccps_timestamp_time;
+ tstamp_echo = htonl(dp->dccps_timestamp_echo);
+ dp->dccps_timestamp_echo = 0;
}
+ elapsed_time_len = dccp_elapsed_time_len(elapsed_time);
+ len = 6 + elapsed_time_len;
+
+ if (DCCP_SKB_CB(skb)->dccpd_opt_len + len > DCCP_MAX_OPT_LEN)
+ return -1;
+
DCCP_SKB_CB(skb)->dccpd_opt_len += len;
to = skb_push(skb, len);
- *to++ = DCCPO_ELAPSED_TIME;
+ *to++ = DCCPO_TIMESTAMP_ECHO;
*to++ = len;
+ memcpy(to, &tstamp_echo, 4);
+ to += 4;
+
if (elapsed_time_len == 2) {
- const u16 var16 = htons((u16)elapsed_time);
+ const __be16 var16 = htons((u16)elapsed_time);
memcpy(to, &var16, 2);
- } else {
- const u32 var32 = htonl(elapsed_time);
+ } else if (elapsed_time_len == 4) {
+ const __be32 var32 = htonl(elapsed_time);
memcpy(to, &var32, 4);
}
- dccp_pr_debug("%sELAPSED_TIME=%u, len=%d, seqno=%llu\n",
- debug_prefix, elapsed_time,
- len,
- (unsigned long long) DCCP_SKB_CB(skb)->dccpd_seq);
+ return 0;
}
-EXPORT_SYMBOL_GPL(dccp_insert_option_elapsed_time);
-
-void dccp_timestamp(const struct sock *sk, struct timeval *tv)
+static int dccp_insert_option_ackvec(struct sock *sk, struct sk_buff *skb)
{
- const struct dccp_sock *dp = dccp_sk(sk);
+ struct dccp_sock *dp = dccp_sk(sk);
+ struct dccp_ackvec *av = dp->dccps_hc_rx_ackvec;
+ struct dccp_skb_cb *dcb = DCCP_SKB_CB(skb);
+ const u16 buflen = dccp_ackvec_buflen(av);
+ /* Figure out how many options we need to represent the ackvec */
+ const u8 nr_opts = DIV_ROUND_UP(buflen, DCCP_SINGLE_OPT_MAXLEN);
+ u16 len = buflen + 2 * nr_opts;
+ u8 i, nonce = 0;
+ const unsigned char *tail, *from;
+ unsigned char *to;
+
+ if (dcb->dccpd_opt_len + len > DCCP_MAX_OPT_LEN) {
+ DCCP_WARN("Lacking space for %u bytes on %s packet\n", len,
+ dccp_packet_name(dcb->dccpd_type));
+ return -1;
+ }
+ /*
+ * Since Ack Vectors are variable-length, we cannot always predict
+ * their size. To catch exception cases where the space is running out
+ * on the skb, a separate Sync is scheduled to carry the Ack Vector.
+ */
+ if (len > DCCPAV_MIN_OPTLEN &&
+ len + dcb->dccpd_opt_len + skb->len > dp->dccps_mss_cache) {
+ DCCP_WARN("No space left for Ack Vector (%u) on skb (%u+%u), "
+ "MPS=%u ==> reduce payload size?\n", len, skb->len,
+ dcb->dccpd_opt_len, dp->dccps_mss_cache);
+ dp->dccps_sync_scheduled = 1;
+ return 0;
+ }
+ dcb->dccpd_opt_len += len;
+
+ to = skb_push(skb, len);
+ len = buflen;
+ from = av->av_buf + av->av_buf_head;
+ tail = av->av_buf + DCCPAV_MAX_ACKVEC_LEN;
+
+ for (i = 0; i < nr_opts; ++i) {
+ int copylen = len;
- do_gettimeofday(tv);
- tv->tv_sec -= dp->dccps_epoch.tv_sec;
- tv->tv_usec -= dp->dccps_epoch.tv_usec;
+ if (len > DCCP_SINGLE_OPT_MAXLEN)
+ copylen = DCCP_SINGLE_OPT_MAXLEN;
- while (tv->tv_usec < 0) {
- tv->tv_sec--;
- tv->tv_usec += USEC_PER_SEC;
+ /*
+ * RFC 4340, 12.2: Encode the Nonce Echo for this Ack Vector via
+ * its type; ack_nonce is the sum of all individual buf_nonce's.
+ */
+ nonce ^= av->av_buf_nonce[i];
+
+ *to++ = DCCPO_ACK_VECTOR_0 + av->av_buf_nonce[i];
+ *to++ = copylen + 2;
+
+ /* Check if buf_head wraps */
+ if (from + copylen > tail) {
+ const u16 tailsize = tail - from;
+
+ memcpy(to, from, tailsize);
+ to += tailsize;
+ len -= tailsize;
+ copylen -= tailsize;
+ from = av->av_buf;
+ }
+
+ memcpy(to, from, copylen);
+ from += copylen;
+ to += copylen;
+ len -= copylen;
}
+ /*
+ * Each sent Ack Vector is recorded in the list, as per A.2 of RFC 4340.
+ */
+ if (dccp_ackvec_update_records(av, dcb->dccpd_seq, nonce))
+ return -ENOBUFS;
+ return 0;
}
-EXPORT_SYMBOL_GPL(dccp_timestamp);
-
-void dccp_insert_option_timestamp(struct sock *sk, struct sk_buff *skb)
+/**
+ * dccp_insert_option_mandatory - Mandatory option (5.8.2)
+ * Note that since we are using skb_push, this function needs to be called
+ * _after_ inserting the option it is supposed to influence (stack order).
+ */
+int dccp_insert_option_mandatory(struct sk_buff *skb)
{
- struct timeval tv;
- u32 now;
-
- dccp_timestamp(sk, &tv);
- now = timeval_usecs(&tv) / 10;
- /* yes this will overflow but that is the point as we want a
- * 10 usec 32 bit timer which mean it wraps every 11.9 hours */
+ if (DCCP_SKB_CB(skb)->dccpd_opt_len >= DCCP_MAX_OPT_LEN)
+ return -1;
- now = htonl(now);
- dccp_insert_option(sk, skb, DCCPO_TIMESTAMP, &now, sizeof(now));
+ DCCP_SKB_CB(skb)->dccpd_opt_len++;
+ *skb_push(skb, 1) = DCCPO_MANDATORY;
+ return 0;
}
-EXPORT_SYMBOL_GPL(dccp_insert_option_timestamp);
-
-static void dccp_insert_option_timestamp_echo(struct sock *sk,
- struct sk_buff *skb)
+/**
+ * dccp_insert_fn_opt - Insert single Feature-Negotiation option into @skb
+ * @type: %DCCPO_CHANGE_L, %DCCPO_CHANGE_R, %DCCPO_CONFIRM_L, %DCCPO_CONFIRM_R
+ * @feat: one out of %dccp_feature_numbers
+ * @val: NN value or SP array (preferred element first) to copy
+ * @len: true length of @val in bytes (excluding first element repetition)
+ * @repeat_first: whether to copy the first element of @val twice
+ *
+ * The last argument is used to construct Confirm options, where the preferred
+ * value and the preference list appear separately (RFC 4340, 6.3.1). Preference
+ * lists are kept such that the preferred entry is always first, so we only need
+ * to copy twice, and avoid the overhead of cloning into a bigger array.
+ */
+int dccp_insert_fn_opt(struct sk_buff *skb, u8 type, u8 feat,
+ u8 *val, u8 len, bool repeat_first)
{
- struct dccp_sock *dp = dccp_sk(sk);
-#ifdef CONFIG_IP_DCCP_DEBUG
- const char *debug_prefix = dp->dccps_role == DCCP_ROLE_CLIENT ?
- "CLIENT TX opt: " : "server TX opt: ";
-#endif
- struct timeval now;
- u32 tstamp_echo;
- u32 elapsed_time;
- int len, elapsed_time_len;
- unsigned char *to;
-
- dccp_timestamp(sk, &now);
- elapsed_time = timeval_delta(&now, &dp->dccps_timestamp_time) / 10;
- elapsed_time_len = dccp_elapsed_time_len(elapsed_time);
- len = 6 + elapsed_time_len;
+ u8 tot_len, *to;
- if (DCCP_SKB_CB(skb)->dccpd_opt_len + len > DCCP_MAX_OPT_LEN) {
- LIMIT_NETDEBUG(KERN_INFO "DCCP: packet too small to insert "
- "timestamp echo!\n");
- return;
+ /* take the `Feature' field and possible repetition into account */
+ if (len > (DCCP_SINGLE_OPT_MAXLEN - 2)) {
+ DCCP_WARN("length %u for feature %u too large\n", len, feat);
+ return -1;
}
- DCCP_SKB_CB(skb)->dccpd_opt_len += len;
+ if (unlikely(val == NULL || len == 0))
+ len = repeat_first = false;
+ tot_len = 3 + repeat_first + len;
- to = skb_push(skb, len);
- *to++ = DCCPO_TIMESTAMP_ECHO;
- *to++ = len;
-
- tstamp_echo = htonl(dp->dccps_timestamp_echo);
- memcpy(to, &tstamp_echo, 4);
- to += 4;
-
- if (elapsed_time_len == 2) {
- const u16 var16 = htons((u16)elapsed_time);
- memcpy(to, &var16, 2);
- } else if (elapsed_time_len == 4) {
- const u32 var32 = htonl(elapsed_time);
- memcpy(to, &var32, 4);
+ if (DCCP_SKB_CB(skb)->dccpd_opt_len + tot_len > DCCP_MAX_OPT_LEN) {
+ DCCP_WARN("packet too small for feature %d option!\n", feat);
+ return -1;
}
+ DCCP_SKB_CB(skb)->dccpd_opt_len += tot_len;
+
+ to = skb_push(skb, tot_len);
+ *to++ = type;
+ *to++ = tot_len;
+ *to++ = feat;
- dccp_pr_debug("%sTIMESTAMP_ECHO=%u, len=%d, seqno=%llu\n",
- debug_prefix, dp->dccps_timestamp_echo,
- len,
- (unsigned long long) DCCP_SKB_CB(skb)->dccpd_seq);
+ if (repeat_first)
+ *to++ = *val;
+ if (len)
+ memcpy(to, val, len);
+ return 0;
+}
+
+/* The length of all options needs to be a multiple of 4 (5.8) */
+static void dccp_insert_option_padding(struct sk_buff *skb)
+{
+ int padding = DCCP_SKB_CB(skb)->dccpd_opt_len % 4;
- dp->dccps_timestamp_echo = 0;
- dp->dccps_timestamp_time.tv_sec = 0;
- dp->dccps_timestamp_time.tv_usec = 0;
+ if (padding != 0) {
+ padding = 4 - padding;
+ memset(skb_push(skb, padding), 0, padding);
+ DCCP_SKB_CB(skb)->dccpd_opt_len += padding;
+ }
}
-void dccp_insert_options(struct sock *sk, struct sk_buff *skb)
+int dccp_insert_options(struct sock *sk, struct sk_buff *skb)
{
struct dccp_sock *dp = dccp_sk(sk);
DCCP_SKB_CB(skb)->dccpd_opt_len = 0;
- if (dp->dccps_options.dccpo_send_ndp_count)
- dccp_insert_option_ndp(sk, skb);
+ if (dp->dccps_send_ndp_count && dccp_insert_option_ndp(sk, skb))
+ return -1;
+
+ if (DCCP_SKB_CB(skb)->dccpd_type != DCCP_PKT_DATA) {
- if (!dccp_packet_without_ack(skb)) {
- if (dp->dccps_options.dccpo_send_ack_vector &&
- dccp_ackvec_pending(dp->dccps_hc_rx_ackvec))
- dccp_insert_option_ackvec(sk, skb);
- if (dp->dccps_timestamp_echo != 0)
- dccp_insert_option_timestamp_echo(sk, skb);
+ /* Feature Negotiation */
+ if (dccp_feat_insert_opts(dp, NULL, skb))
+ return -1;
+
+ if (DCCP_SKB_CB(skb)->dccpd_type == DCCP_PKT_REQUEST) {
+ /*
+ * Obtain RTT sample from Request/Response exchange.
+ * This is currently used for TFRC initialisation.
+ */
+ if (dccp_insert_option_timestamp(skb))
+ return -1;
+
+ } else if (dccp_ackvec_pending(sk) &&
+ dccp_insert_option_ackvec(sk, skb)) {
+ return -1;
+ }
}
if (dp->dccps_hc_rx_insert_options) {
- ccid_hc_rx_insert_options(dp->dccps_hc_rx_ccid, sk, skb);
+ if (ccid_hc_rx_insert_options(dp->dccps_hc_rx_ccid, sk, skb))
+ return -1;
dp->dccps_hc_rx_insert_options = 0;
}
- if (dp->dccps_hc_tx_insert_options) {
- ccid_hc_tx_insert_options(dp->dccps_hc_tx_ccid, sk, skb);
- dp->dccps_hc_tx_insert_options = 0;
- }
- /* XXX: insert other options when appropriate */
+ if (dp->dccps_timestamp_echo != 0 &&
+ dccp_insert_option_timestamp_echo(dp, NULL, skb))
+ return -1;
- if (DCCP_SKB_CB(skb)->dccpd_opt_len != 0) {
- /* The length of all options has to be a multiple of 4 */
- int padding = DCCP_SKB_CB(skb)->dccpd_opt_len % 4;
+ dccp_insert_option_padding(skb);
+ return 0;
+}
- if (padding != 0) {
- padding = 4 - padding;
- memset(skb_push(skb, padding), 0, padding);
- DCCP_SKB_CB(skb)->dccpd_opt_len += padding;
- }
- }
+int dccp_insert_options_rsk(struct dccp_request_sock *dreq, struct sk_buff *skb)
+{
+ DCCP_SKB_CB(skb)->dccpd_opt_len = 0;
+
+ if (dccp_feat_insert_opts(NULL, dreq, skb))
+ return -1;
+
+ /* Obtain RTT sample from Response/Ack exchange (used by TFRC). */
+ if (dccp_insert_option_timestamp(skb))
+ return -1;
+
+ if (dreq->dreq_timestamp_echo != 0 &&
+ dccp_insert_option_timestamp_echo(NULL, dreq, skb))
+ return -1;
+
+ dccp_insert_option_padding(skb);
+ return 0;
}
diff --git a/net/dccp/output.c b/net/dccp/output.c
index efd7ffb903a..0248e8a3460 100644
--- a/net/dccp/output.c
+++ b/net/dccp/output.c
@@ -1,6 +1,6 @@
/*
* net/dccp/output.c
- *
+ *
* An implementation of the DCCP protocol
* Arnaldo Carvalho de Melo <acme@conectiva.com.br>
*
@@ -10,10 +10,10 @@
* 2 of the License, or (at your option) any later version.
*/
-#include <linux/config.h>
#include <linux/dccp.h>
#include <linux/kernel.h>
#include <linux/skbuff.h>
+#include <linux/slab.h>
#include <net/inet_sock.h>
#include <net/sock.h>
@@ -27,11 +27,13 @@ static inline void dccp_event_ack_sent(struct sock *sk)
inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK);
}
-static inline void dccp_skb_entail(struct sock *sk, struct sk_buff *skb)
+/* enqueue @skb on sk_send_head for retransmission, return clone to send now */
+static struct sk_buff *dccp_skb_entail(struct sock *sk, struct sk_buff *skb)
{
skb_set_owner_w(skb, sk);
WARN_ON(sk->sk_send_head);
sk->sk_send_head = skb;
+ return skb_clone(sk->sk_send_head, gfp_any());
}
/*
@@ -43,58 +45,72 @@ static inline void dccp_skb_entail(struct sock *sk, struct sk_buff *skb)
static int dccp_transmit_skb(struct sock *sk, struct sk_buff *skb)
{
if (likely(skb != NULL)) {
- const struct inet_sock *inet = inet_sk(sk);
+ struct inet_sock *inet = inet_sk(sk);
const struct inet_connection_sock *icsk = inet_csk(sk);
struct dccp_sock *dp = dccp_sk(sk);
struct dccp_skb_cb *dcb = DCCP_SKB_CB(skb);
struct dccp_hdr *dh;
/* XXX For now we're using only 48 bits sequence numbers */
- const int dccp_header_size = sizeof(*dh) +
+ const u32 dccp_header_size = sizeof(*dh) +
sizeof(struct dccp_hdr_ext) +
dccp_packet_hdr_len(dcb->dccpd_type);
int err, set_ack = 1;
u64 ackno = dp->dccps_gsr;
-
- dccp_inc_seqno(&dp->dccps_gss);
+ /*
+ * Increment GSS here already in case the option code needs it.
+ * Update GSS for real only if option processing below succeeds.
+ */
+ dcb->dccpd_seq = ADD48(dp->dccps_gss, 1);
switch (dcb->dccpd_type) {
case DCCP_PKT_DATA:
set_ack = 0;
/* fall through */
case DCCP_PKT_DATAACK:
+ case DCCP_PKT_RESET:
break;
+ case DCCP_PKT_REQUEST:
+ set_ack = 0;
+ /* Use ISS on the first (non-retransmitted) Request. */
+ if (icsk->icsk_retransmits == 0)
+ dcb->dccpd_seq = dp->dccps_iss;
+ /* fall through */
+
case DCCP_PKT_SYNC:
case DCCP_PKT_SYNCACK:
- ackno = dcb->dccpd_seq;
+ ackno = dcb->dccpd_ack_seq;
/* fall through */
default:
/*
- * Only data packets should come through with skb->sk
- * set.
+ * Set owner/destructor: some skbs are allocated via
+ * alloc_skb (e.g. when retransmission may happen).
+ * Only Data, DataAck, and Reset packets should come
+ * through here with skb->sk set.
*/
WARN_ON(skb->sk);
skb_set_owner_w(skb, sk);
break;
}
- dcb->dccpd_seq = dp->dccps_gss;
- dccp_insert_options(sk, skb);
-
- skb->h.raw = skb_push(skb, dccp_header_size);
- dh = dccp_hdr(skb);
+ if (dccp_insert_options(sk, skb)) {
+ kfree_skb(skb);
+ return -EPROTO;
+ }
+
/* Build DCCP header and checksum it. */
- memset(dh, 0, dccp_header_size);
+ dh = dccp_zeroed_hdr(skb, dccp_header_size);
dh->dccph_type = dcb->dccpd_type;
- dh->dccph_sport = inet->sport;
- dh->dccph_dport = inet->dport;
+ dh->dccph_sport = inet->inet_sport;
+ dh->dccph_dport = inet->inet_dport;
dh->dccph_doff = (dccp_header_size + dcb->dccpd_opt_len) / 4;
dh->dccph_ccval = dcb->dccpd_ccval;
+ dh->dccph_cscov = dp->dccps_pcslen;
/* XXX For now we're using only 48 bits sequence numbers */
dh->dccph_x = 1;
- dp->dccps_awh = dp->dccps_gss;
+ dccp_update_gss(sk, dcb->dccpd_seq);
dccp_hdr_set_seq(dh, dp->dccps_gss);
if (set_ack)
dccp_hdr_set_ack(dccp_hdr_ack_bits(skb), ackno);
@@ -103,6 +119,11 @@ static int dccp_transmit_skb(struct sock *sk, struct sk_buff *skb)
case DCCP_PKT_REQUEST:
dccp_hdr_request(skb)->dccph_req_service =
dp->dccps_service;
+ /*
+ * Limit Ack window to ISS <= P.ackno <= GSS, so that
+ * only Responses to Requests we sent are considered.
+ */
+ dp->dccps_awl = dp->dccps_iss;
break;
case DCCP_PKT_RESET:
dccp_hdr_reset(skb)->dccph_reset_code =
@@ -110,164 +131,263 @@ static int dccp_transmit_skb(struct sock *sk, struct sk_buff *skb)
break;
}
- icsk->icsk_af_ops->send_check(sk, skb->len, skb);
+ icsk->icsk_af_ops->send_check(sk, skb);
if (set_ack)
dccp_event_ack_sent(sk);
DCCP_INC_STATS(DCCP_MIB_OUTSEGS);
- memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
- err = icsk->icsk_af_ops->queue_xmit(skb, 0);
- if (err <= 0)
- return err;
-
- /* NET_XMIT_CN is special. It does not guarantee,
- * that this packet is lost. It tells that device
- * is about to start to drop packets or already
- * drops some packets of the same priority and
- * invokes us to send less aggressively.
- */
- return err == NET_XMIT_CN ? 0 : err;
+ err = icsk->icsk_af_ops->queue_xmit(sk, skb, &inet->cork.fl);
+ return net_xmit_eval(err);
}
return -ENOBUFS;
}
+/**
+ * dccp_determine_ccmps - Find out about CCID-specific packet-size limits
+ * @dp: DCCP socket whose HC-sender (TX) CCID is queried
+ *
+ * We only consider the HC-sender CCID for setting the CCMPS (RFC 4340, 14.),
+ * since the RX CCID is restricted to feedback packets (Acks), which are small
+ * in comparison with the data traffic. A value of 0 means "no current CCMPS".
+ */
+static u32 dccp_determine_ccmps(const struct dccp_sock *dp)
+{
+ const struct ccid *tx_ccid = dp->dccps_hc_tx_ccid;
+
+ if (tx_ccid == NULL || tx_ccid->ccid_ops == NULL)
+ return 0;
+ return tx_ccid->ccid_ops->ccid_ccmps;
+}
+
unsigned int dccp_sync_mss(struct sock *sk, u32 pmtu)
{
struct inet_connection_sock *icsk = inet_csk(sk);
struct dccp_sock *dp = dccp_sk(sk);
- int mss_now = (pmtu - icsk->icsk_af_ops->net_header_len -
- sizeof(struct dccp_hdr) - sizeof(struct dccp_hdr_ext));
+ u32 ccmps = dccp_determine_ccmps(dp);
+ u32 cur_mps = ccmps ? min(pmtu, ccmps) : pmtu;
- /* Now subtract optional transport overhead */
- mss_now -= icsk->icsk_ext_hdr_len;
+ /* Account for header lengths and IPv4/v6 option overhead */
+ cur_mps -= (icsk->icsk_af_ops->net_header_len + icsk->icsk_ext_hdr_len +
+ sizeof(struct dccp_hdr) + sizeof(struct dccp_hdr_ext));
/*
- * FIXME: this should come from the CCID infrastructure, where, say,
- * TFRC will say it wants TIMESTAMPS, ELAPSED time, etc, for now lets
- * put a rough estimate for NDP + TIMESTAMP + TIMESTAMP_ECHO + ELAPSED
- * TIME + TFRC_OPT_LOSS_EVENT_RATE + TFRC_OPT_RECEIVE_RATE + padding to
- * make it a multiple of 4
+ * Leave enough headroom for common DCCP header options.
+ * This only considers options which may appear on DCCP-Data packets, as
+ * per table 3 in RFC 4340, 5.8. When running out of space for other
+ * options (e.g. Ack Vector which can take up to 255 bytes), it is better
+ * to schedule a separate Ack. Thus we leave headroom for the following:
+ * - 1 byte for Slow Receiver (11.6)
+ * - 6 bytes for Timestamp (13.1)
+ * - 10 bytes for Timestamp Echo (13.3)
+ * - 8 bytes for NDP count (7.7, when activated)
+ * - 6 bytes for Data Checksum (9.3)
+ * - %DCCPAV_MIN_OPTLEN bytes for Ack Vector size (11.4, when enabled)
*/
-
- mss_now -= ((5 + 6 + 10 + 6 + 6 + 6 + 3) / 4) * 4;
+ cur_mps -= roundup(1 + 6 + 10 + dp->dccps_send_ndp_count * 8 + 6 +
+ (dp->dccps_hc_rx_ackvec ? DCCPAV_MIN_OPTLEN : 0), 4);
/* And store cached results */
icsk->icsk_pmtu_cookie = pmtu;
- dp->dccps_mss_cache = mss_now;
+ dp->dccps_mss_cache = cur_mps;
- return mss_now;
+ return cur_mps;
}
EXPORT_SYMBOL_GPL(dccp_sync_mss);
void dccp_write_space(struct sock *sk)
{
- read_lock(&sk->sk_callback_lock);
+ struct socket_wq *wq;
- if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
- wake_up_interruptible(sk->sk_sleep);
+ rcu_read_lock();
+ wq = rcu_dereference(sk->sk_wq);
+ if (wq_has_sleeper(wq))
+ wake_up_interruptible(&wq->wait);
/* Should agree with poll, otherwise some programs break */
if (sock_writeable(sk))
- sk_wake_async(sk, 2, POLL_OUT);
+ sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
- read_unlock(&sk->sk_callback_lock);
+ rcu_read_unlock();
}
/**
- * dccp_wait_for_ccid - Wait for ccid to tell us we can send a packet
- * @sk: socket to wait for
- * @timeo: for how long
+ * dccp_wait_for_ccid - Await CCID send permission
+ * @sk: socket to wait for
+ * @delay: timeout in jiffies
+ *
+ * This is used by CCIDs which need to delay the send time in process context.
*/
-static int dccp_wait_for_ccid(struct sock *sk, struct sk_buff *skb,
- long *timeo)
+static int dccp_wait_for_ccid(struct sock *sk, unsigned long delay)
{
- struct dccp_sock *dp = dccp_sk(sk);
DEFINE_WAIT(wait);
- long delay;
- int rc;
-
- while (1) {
- prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
-
- if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
- goto do_error;
- if (!*timeo)
- goto do_nonblock;
- if (signal_pending(current))
- goto do_interrupted;
-
- rc = ccid_hc_tx_send_packet(dp->dccps_hc_tx_ccid, sk, skb,
- skb->len);
- if (rc <= 0)
- break;
- delay = msecs_to_jiffies(rc);
- if (delay > *timeo || delay < 0)
- goto do_nonblock;
-
- sk->sk_write_pending++;
- release_sock(sk);
- *timeo -= schedule_timeout(delay);
- lock_sock(sk);
- sk->sk_write_pending--;
+ long remaining;
+
+ prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
+ sk->sk_write_pending++;
+ release_sock(sk);
+
+ remaining = schedule_timeout(delay);
+
+ lock_sock(sk);
+ sk->sk_write_pending--;
+ finish_wait(sk_sleep(sk), &wait);
+
+ if (signal_pending(current) || sk->sk_err)
+ return -1;
+ return remaining;
+}
+
+/**
+ * dccp_xmit_packet - Send data packet under control of CCID
+ * @sk: socket to send the packet from
+ *
+ * Transmits next-queued payload and informs CCID to account for the packet.
+ */
+static void dccp_xmit_packet(struct sock *sk)
+{
+ int err, len;
+ struct dccp_sock *dp = dccp_sk(sk);
+ struct sk_buff *skb = dccp_qpolicy_pop(sk);
+
+ if (unlikely(skb == NULL))
+ return;
+ len = skb->len;
+
+ if (sk->sk_state == DCCP_PARTOPEN) {
+ const u32 cur_mps = dp->dccps_mss_cache - DCCP_FEATNEG_OVERHEAD;
+ /*
+ * See 8.1.5 - Handshake Completion.
+ *
+ * For robustness we resend Confirm options until the client has
+ * entered OPEN. During the initial feature negotiation, the MPS
+ * is smaller than usual, reduced by the Change/Confirm options.
+ */
+ if (!list_empty(&dp->dccps_featneg) && len > cur_mps) {
+ DCCP_WARN("Payload too large (%d) for featneg.\n", len);
+ dccp_send_ack(sk);
+ dccp_feat_list_purge(&dp->dccps_featneg);
+ }
+
+ inet_csk_schedule_ack(sk);
+ inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
+ inet_csk(sk)->icsk_rto,
+ DCCP_RTO_MAX);
+ DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_DATAACK;
+ } else if (dccp_ack_pending(sk)) {
+ DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_DATAACK;
+ } else {
+ DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_DATA;
}
-out:
- finish_wait(sk->sk_sleep, &wait);
- return rc;
-
-do_error:
- rc = -EPIPE;
- goto out;
-do_nonblock:
- rc = -EAGAIN;
- goto out;
-do_interrupted:
- rc = sock_intr_errno(*timeo);
- goto out;
+
+ err = dccp_transmit_skb(sk, skb);
+ if (err)
+ dccp_pr_debug("transmit_skb() returned err=%d\n", err);
+ /*
+ * Register this one as sent even if an error occurred. To the remote
+ * end a local packet drop is indistinguishable from network loss, i.e.
+ * any local drop will eventually be reported via receiver feedback.
+ */
+ ccid_hc_tx_packet_sent(dp->dccps_hc_tx_ccid, sk, len);
+
+ /*
+ * If the CCID needs to transfer additional header options out-of-band
+ * (e.g. Ack Vectors or feature-negotiation options), it activates this
+ * flag to schedule a Sync. The Sync will automatically incorporate all
+ * currently pending header options, thus clearing the backlog.
+ */
+ if (dp->dccps_sync_scheduled)
+ dccp_send_sync(sk, dp->dccps_gsr, DCCP_PKT_SYNC);
}
-int dccp_write_xmit(struct sock *sk, struct sk_buff *skb, long *timeo)
+/**
+ * dccp_flush_write_queue - Drain queue at end of connection
+ * @sk: socket whose write queue is to be drained
+ * @time_budget: grace period in jiffies; reduced by the time spent waiting
+ *
+ * Since dccp_sendmsg queues packets without waiting for them to be sent, it may
+ * happen that the TX queue is not empty at the end of a connection. We give the
+ * HC-sender CCID a grace period of up to @time_budget jiffies. If this function
+ * returns with a non-empty write queue, it will be purged later.
+ */
+void dccp_flush_write_queue(struct sock *sk, long *time_budget)
{
- const struct dccp_sock *dp = dccp_sk(sk);
- int err = ccid_hc_tx_send_packet(dp->dccps_hc_tx_ccid, sk, skb,
- skb->len);
+ struct dccp_sock *dp = dccp_sk(sk);
+ struct sk_buff *skb;
+ long delay, rc;
- if (err > 0)
- err = dccp_wait_for_ccid(sk, skb, timeo);
+ while (*time_budget > 0 && (skb = skb_peek(&sk->sk_write_queue))) {
+ rc = ccid_hc_tx_send_packet(dp->dccps_hc_tx_ccid, sk, skb);
- if (err == 0) {
- struct dccp_skb_cb *dcb = DCCP_SKB_CB(skb);
- const int len = skb->len;
+ switch (ccid_packet_dequeue_eval(rc)) {
+ case CCID_PACKET_WILL_DEQUEUE_LATER:
+ /*
+ * If the CCID determines when to send, the next sending
+ * time is unknown or the CCID may not even send again
+ * (e.g. remote host crashes or lost Ack packets).
+ */
+ DCCP_WARN("CCID did not manage to send all packets\n");
+ return;
+ case CCID_PACKET_DELAY:
+ delay = msecs_to_jiffies(rc);
+ if (delay > *time_budget)
+ return;
+ rc = dccp_wait_for_ccid(sk, delay);
+ if (rc < 0)
+ return;
+ *time_budget -= (delay - rc);
+ /* check again if we can send now */
+ break;
+ case CCID_PACKET_SEND_AT_ONCE:
+ dccp_xmit_packet(sk);
+ break;
+ case CCID_PACKET_ERR:
+ skb_dequeue(&sk->sk_write_queue);
+ kfree_skb(skb);
+ dccp_pr_debug("packet discarded due to err=%ld\n", rc);
+ }
+ }
+}
- if (sk->sk_state == DCCP_PARTOPEN) {
- /* See 8.1.5. Handshake Completion */
- inet_csk_schedule_ack(sk);
- inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
- inet_csk(sk)->icsk_rto,
- DCCP_RTO_MAX);
- dcb->dccpd_type = DCCP_PKT_DATAACK;
- } else if (dccp_ack_pending(sk))
- dcb->dccpd_type = DCCP_PKT_DATAACK;
- else
- dcb->dccpd_type = DCCP_PKT_DATA;
-
- err = dccp_transmit_skb(sk, skb);
- ccid_hc_tx_packet_sent(dp->dccps_hc_tx_ccid, sk, 0, len);
- } else
- kfree_skb(skb);
-
- return err;
+void dccp_write_xmit(struct sock *sk)
+{
+ struct dccp_sock *dp = dccp_sk(sk);
+ struct sk_buff *skb;
+
+ while ((skb = dccp_qpolicy_top(sk))) {
+ int rc = ccid_hc_tx_send_packet(dp->dccps_hc_tx_ccid, sk, skb);
+
+ switch (ccid_packet_dequeue_eval(rc)) {
+ case CCID_PACKET_WILL_DEQUEUE_LATER:
+ return;
+ case CCID_PACKET_DELAY:
+ sk_reset_timer(sk, &dp->dccps_xmit_timer,
+ jiffies + msecs_to_jiffies(rc));
+ return;
+ case CCID_PACKET_SEND_AT_ONCE:
+ dccp_xmit_packet(sk);
+ break;
+ case CCID_PACKET_ERR:
+ dccp_qpolicy_drop(sk, skb);
+ dccp_pr_debug("packet discarded due to err=%d\n", rc);
+ }
+ }
}
-int dccp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
+/**
+ * dccp_retransmit_skb - Retransmit Request, Close, or CloseReq packets
+ * There are only four retransmittable packet types in DCCP:
+ * - Request in client-REQUEST state (sec. 8.1.1),
+ * - CloseReq in server-CLOSEREQ state (sec. 8.3),
+ * - Close in node-CLOSING state (sec. 8.3),
+ * - Acks in client-PARTOPEN state (sec. 8.1.5, handled by dccp_delack_timer()).
+ * This function expects sk->sk_send_head to contain the original skb.
+ */
+int dccp_retransmit_skb(struct sock *sk)
{
+ WARN_ON(sk->sk_send_head == NULL);
+
if (inet_csk(sk)->icsk_af_ops->rebuild_header(sk) != 0)
return -EHOSTUNREACH; /* Routing failure or similar. */
- return dccp_transmit_skb(sk, (skb_cloned(skb) ?
- pskb_copy(skb, GFP_ATOMIC):
- skb_clone(skb, GFP_ATOMIC)));
+ /* this count is used to distinguish original and retransmitted skb */
+ inet_csk(sk)->icsk_retransmits++;
+
+ return dccp_transmit_skb(sk, skb_clone(sk->sk_send_head, GFP_ATOMIC));
}
struct sk_buff *dccp_make_response(struct sock *sk, struct dst_entry *dst,
@@ -275,151 +395,171 @@ struct sk_buff *dccp_make_response(struct sock *sk, struct dst_entry *dst,
{
struct dccp_hdr *dh;
struct dccp_request_sock *dreq;
- const int dccp_header_size = sizeof(struct dccp_hdr) +
+ const u32 dccp_header_size = sizeof(struct dccp_hdr) +
sizeof(struct dccp_hdr_ext) +
sizeof(struct dccp_hdr_response);
- struct sk_buff *skb = sock_wmalloc(sk, MAX_HEADER + DCCP_MAX_OPT_LEN +
- dccp_header_size, 1,
+ struct sk_buff *skb = sock_wmalloc(sk, sk->sk_prot->max_header, 1,
GFP_ATOMIC);
if (skb == NULL)
return NULL;
/* Reserve space for headers. */
- skb_reserve(skb, MAX_HEADER + DCCP_MAX_OPT_LEN + dccp_header_size);
+ skb_reserve(skb, sk->sk_prot->max_header);
- skb->dst = dst_clone(dst);
- skb->csum = 0;
+ skb_dst_set(skb, dst_clone(dst));
dreq = dccp_rsk(req);
+ if (inet_rsk(req)->acked) /* increase GSS upon retransmission */
+ dccp_inc_seqno(&dreq->dreq_gss);
DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_RESPONSE;
- DCCP_SKB_CB(skb)->dccpd_seq = dreq->dreq_iss;
- dccp_insert_options(sk, skb);
+ DCCP_SKB_CB(skb)->dccpd_seq = dreq->dreq_gss;
+
+ /* Resolve feature dependencies resulting from choice of CCID */
+ if (dccp_feat_server_ccid_dependencies(dreq))
+ goto response_failed;
- skb->h.raw = skb_push(skb, dccp_header_size);
+ if (dccp_insert_options_rsk(dreq, skb))
+ goto response_failed;
- dh = dccp_hdr(skb);
- memset(dh, 0, dccp_header_size);
+ /* Build and checksum header */
+ dh = dccp_zeroed_hdr(skb, dccp_header_size);
- dh->dccph_sport = inet_sk(sk)->sport;
- dh->dccph_dport = inet_rsk(req)->rmt_port;
+ dh->dccph_sport = htons(inet_rsk(req)->ir_num);
+ dh->dccph_dport = inet_rsk(req)->ir_rmt_port;
dh->dccph_doff = (dccp_header_size +
DCCP_SKB_CB(skb)->dccpd_opt_len) / 4;
dh->dccph_type = DCCP_PKT_RESPONSE;
dh->dccph_x = 1;
- dccp_hdr_set_seq(dh, dreq->dreq_iss);
- dccp_hdr_set_ack(dccp_hdr_ack_bits(skb), dreq->dreq_isr);
+ dccp_hdr_set_seq(dh, dreq->dreq_gss);
+ dccp_hdr_set_ack(dccp_hdr_ack_bits(skb), dreq->dreq_gsr);
dccp_hdr_response(skb)->dccph_resp_service = dreq->dreq_service;
- dh->dccph_checksum = dccp_v4_checksum(skb, inet_rsk(req)->loc_addr,
- inet_rsk(req)->rmt_addr);
+ dccp_csum_outgoing(skb);
+ /* We use `acked' to remember that a Response was already sent. */
+ inet_rsk(req)->acked = 1;
DCCP_INC_STATS(DCCP_MIB_OUTSEGS);
return skb;
+response_failed:
+ kfree_skb(skb);
+ return NULL;
}
EXPORT_SYMBOL_GPL(dccp_make_response);
-struct sk_buff *dccp_make_reset(struct sock *sk, struct dst_entry *dst,
- const enum dccp_reset_codes code)
-
+/* answer offending packet in @rcv_skb with Reset from control socket @sk */
+struct sk_buff *dccp_ctl_make_reset(struct sock *sk, struct sk_buff *rcv_skb)
{
- struct dccp_hdr *dh;
- struct dccp_sock *dp = dccp_sk(sk);
- const int dccp_header_size = sizeof(struct dccp_hdr) +
- sizeof(struct dccp_hdr_ext) +
- sizeof(struct dccp_hdr_reset);
- struct sk_buff *skb = sock_wmalloc(sk, MAX_HEADER + DCCP_MAX_OPT_LEN +
- dccp_header_size, 1,
- GFP_ATOMIC);
+ struct dccp_hdr *rxdh = dccp_hdr(rcv_skb), *dh;
+ struct dccp_skb_cb *dcb = DCCP_SKB_CB(rcv_skb);
+ const u32 dccp_hdr_reset_len = sizeof(struct dccp_hdr) +
+ sizeof(struct dccp_hdr_ext) +
+ sizeof(struct dccp_hdr_reset);
+ struct dccp_hdr_reset *dhr;
+ struct sk_buff *skb;
+
+ skb = alloc_skb(sk->sk_prot->max_header, GFP_ATOMIC);
if (skb == NULL)
return NULL;
- /* Reserve space for headers. */
- skb_reserve(skb, MAX_HEADER + DCCP_MAX_OPT_LEN + dccp_header_size);
+ skb_reserve(skb, sk->sk_prot->max_header);
- skb->dst = dst_clone(dst);
- skb->csum = 0;
+ /* Swap the send and the receive. */
+ dh = dccp_zeroed_hdr(skb, dccp_hdr_reset_len);
+ dh->dccph_type = DCCP_PKT_RESET;
+ dh->dccph_sport = rxdh->dccph_dport;
+ dh->dccph_dport = rxdh->dccph_sport;
+ dh->dccph_doff = dccp_hdr_reset_len / 4;
+ dh->dccph_x = 1;
- dccp_inc_seqno(&dp->dccps_gss);
+ dhr = dccp_hdr_reset(skb);
+ dhr->dccph_reset_code = dcb->dccpd_reset_code;
+
+ switch (dcb->dccpd_reset_code) {
+ case DCCP_RESET_CODE_PACKET_ERROR:
+ dhr->dccph_reset_data[0] = rxdh->dccph_type;
+ break;
+ case DCCP_RESET_CODE_OPTION_ERROR: /* fall through */
+ case DCCP_RESET_CODE_MANDATORY_ERROR:
+ memcpy(dhr->dccph_reset_data, dcb->dccpd_reset_data, 3);
+ break;
+ }
+ /*
+ * From RFC 4340, 8.3.1:
+ * If P.ackno exists, set R.seqno := P.ackno + 1.
+ * Else set R.seqno := 0.
+ */
+ if (dcb->dccpd_ack_seq != DCCP_PKT_WITHOUT_ACK_SEQ)
+ dccp_hdr_set_seq(dh, ADD48(dcb->dccpd_ack_seq, 1));
+ dccp_hdr_set_ack(dccp_hdr_ack_bits(skb), dcb->dccpd_seq);
- DCCP_SKB_CB(skb)->dccpd_reset_code = code;
- DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_RESET;
- DCCP_SKB_CB(skb)->dccpd_seq = dp->dccps_gss;
- dccp_insert_options(sk, skb);
+ dccp_csum_outgoing(skb);
+ return skb;
+}
- skb->h.raw = skb_push(skb, dccp_header_size);
+EXPORT_SYMBOL_GPL(dccp_ctl_make_reset);
- dh = dccp_hdr(skb);
- memset(dh, 0, dccp_header_size);
+/* send Reset on established socket, to close or abort the connection */
+int dccp_send_reset(struct sock *sk, enum dccp_reset_codes code)
+{
+ struct sk_buff *skb;
+ /*
+ * FIXME: what if rebuild_header fails?
+ * Should we be doing a rebuild_header here?
+ */
+ int err = inet_csk(sk)->icsk_af_ops->rebuild_header(sk);
- dh->dccph_sport = inet_sk(sk)->sport;
- dh->dccph_dport = inet_sk(sk)->dport;
- dh->dccph_doff = (dccp_header_size +
- DCCP_SKB_CB(skb)->dccpd_opt_len) / 4;
- dh->dccph_type = DCCP_PKT_RESET;
- dh->dccph_x = 1;
- dccp_hdr_set_seq(dh, dp->dccps_gss);
- dccp_hdr_set_ack(dccp_hdr_ack_bits(skb), dp->dccps_gsr);
+ if (err != 0)
+ return err;
- dccp_hdr_reset(skb)->dccph_reset_code = code;
+ skb = sock_wmalloc(sk, sk->sk_prot->max_header, 1, GFP_ATOMIC);
+ if (skb == NULL)
+ return -ENOBUFS;
- dh->dccph_checksum = dccp_v4_checksum(skb, inet_sk(sk)->saddr,
- inet_sk(sk)->daddr);
+ /* Reserve space for headers and prepare control bits. */
+ skb_reserve(skb, sk->sk_prot->max_header);
+ DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_RESET;
+ DCCP_SKB_CB(skb)->dccpd_reset_code = code;
- DCCP_INC_STATS(DCCP_MIB_OUTSEGS);
- return skb;
+ return dccp_transmit_skb(sk, skb);
}
/*
* Do all connect socket setups that can be done AF independent.
*/
-static inline void dccp_connect_init(struct sock *sk)
+int dccp_connect(struct sock *sk)
{
+ struct sk_buff *skb;
struct dccp_sock *dp = dccp_sk(sk);
struct dst_entry *dst = __sk_dst_get(sk);
struct inet_connection_sock *icsk = inet_csk(sk);
sk->sk_err = 0;
sock_reset_flag(sk, SOCK_DONE);
-
- dccp_sync_mss(sk, dst_mtu(dst));
- dccp_update_gss(sk, dp->dccps_iss);
- /*
- * SWL and AWL are initially adjusted so that they are not less than
- * the initial Sequence Numbers received and sent, respectively:
- * SWL := max(GSR + 1 - floor(W/4), ISR),
- * AWL := max(GSS - W' + 1, ISS).
- * These adjustments MUST be applied only at the beginning of the
- * connection.
- */
- dccp_set_seqno(&dp->dccps_awl, max48(dp->dccps_awl, dp->dccps_iss));
-
- icsk->icsk_retransmits = 0;
-}
+ dccp_sync_mss(sk, dst_mtu(dst));
-int dccp_connect(struct sock *sk)
-{
- struct sk_buff *skb;
- struct inet_connection_sock *icsk = inet_csk(sk);
+ /* do not connect if feature negotiation setup fails */
+ if (dccp_feat_finalise_settings(dccp_sk(sk)))
+ return -EPROTO;
- dccp_connect_init(sk);
+ /* Initialise GAR as per 8.5; AWL/AWH are set in dccp_transmit_skb() */
+ dp->dccps_gar = dp->dccps_iss;
- skb = alloc_skb(MAX_DCCP_HEADER + 15, sk->sk_allocation);
+ skb = alloc_skb(sk->sk_prot->max_header, sk->sk_allocation);
if (unlikely(skb == NULL))
return -ENOBUFS;
/* Reserve space for headers. */
- skb_reserve(skb, MAX_DCCP_HEADER);
+ skb_reserve(skb, sk->sk_prot->max_header);
DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_REQUEST;
- skb->csum = 0;
- dccp_skb_entail(sk, skb);
- dccp_transmit_skb(sk, skb_clone(skb, GFP_KERNEL));
+ dccp_transmit_skb(sk, dccp_skb_entail(sk, skb));
DCCP_INC_STATS(DCCP_MIB_ACTIVEOPENS);
/* Timer for repeating the REQUEST until an answer. */
+ icsk->icsk_retransmits = 0;
inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
icsk->icsk_rto, DCCP_RTO_MAX);
return 0;
@@ -431,7 +571,8 @@ void dccp_send_ack(struct sock *sk)
{
/* If we have been reset, we may not send again. */
if (sk->sk_state != DCCP_CLOSED) {
- struct sk_buff *skb = alloc_skb(MAX_DCCP_HEADER, GFP_ATOMIC);
+ struct sk_buff *skb = alloc_skb(sk->sk_prot->max_header,
+ GFP_ATOMIC);
if (skb == NULL) {
inet_csk_schedule_ack(sk);
@@ -443,8 +584,7 @@ void dccp_send_ack(struct sock *sk)
}
/* Reserve space for headers */
- skb_reserve(skb, MAX_DCCP_HEADER);
- skb->csum = 0;
+ skb_reserve(skb, sk->sk_prot->max_header);
DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_ACK;
dccp_transmit_skb(sk, skb);
}
@@ -452,6 +592,8 @@ void dccp_send_ack(struct sock *sk)
EXPORT_SYMBOL_GPL(dccp_send_ack);
+#if 0
+/* FIXME: Is this still necessary (11.3) - currently nowhere used by DCCP. */
void dccp_send_delayed_ack(struct sock *sk)
{
struct inet_connection_sock *icsk = inet_csk(sk);
@@ -481,8 +623,9 @@ void dccp_send_delayed_ack(struct sock *sk)
icsk->icsk_ack.timeout = timeout;
sk_reset_timer(sk, &icsk->icsk_delack_timer, timeout);
}
+#endif
-void dccp_send_sync(struct sock *sk, const u64 seq,
+void dccp_send_sync(struct sock *sk, const u64 ackno,
const enum dccp_pkt_type pkt_type)
{
/*
@@ -490,21 +633,30 @@ void dccp_send_sync(struct sock *sk, const u64 seq,
* dccp_transmit_skb() will set the ownership to this
* sock.
*/
- struct sk_buff *skb = alloc_skb(MAX_DCCP_HEADER, GFP_ATOMIC);
+ struct sk_buff *skb = alloc_skb(sk->sk_prot->max_header, GFP_ATOMIC);
- if (skb == NULL)
+ if (skb == NULL) {
/* FIXME: how to make sure the sync is sent? */
+ DCCP_CRIT("could not send %s", dccp_packet_name(pkt_type));
return;
+ }
/* Reserve space for headers and prepare control bits. */
- skb_reserve(skb, MAX_DCCP_HEADER);
- skb->csum = 0;
+ skb_reserve(skb, sk->sk_prot->max_header);
DCCP_SKB_CB(skb)->dccpd_type = pkt_type;
- DCCP_SKB_CB(skb)->dccpd_seq = seq;
+ DCCP_SKB_CB(skb)->dccpd_ack_seq = ackno;
+
+ /*
+ * Clear the flag in case the Sync was scheduled for out-of-band data,
+ * such as carrying a long Ack Vector.
+ */
+ dccp_sk(sk)->dccps_sync_scheduled = 0;
dccp_transmit_skb(sk, skb);
}
+EXPORT_SYMBOL_GPL(dccp_send_sync);
+
/*
* Send a DCCP_PKT_CLOSE/CLOSEREQ. The caller locks the socket for us. This
* cannot be allowed to fail queueing a DCCP_PKT_CLOSE/CLOSEREQ frame under
@@ -522,13 +674,25 @@ void dccp_send_close(struct sock *sk, const int active)
/* Reserve space for headers and prepare control bits. */
skb_reserve(skb, sk->sk_prot->max_header);
- skb->csum = 0;
- DCCP_SKB_CB(skb)->dccpd_type = dp->dccps_role == DCCP_ROLE_CLIENT ?
- DCCP_PKT_CLOSE : DCCP_PKT_CLOSEREQ;
+ if (dp->dccps_role == DCCP_ROLE_SERVER && !dp->dccps_server_timewait)
+ DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_CLOSEREQ;
+ else
+ DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_CLOSE;
if (active) {
- dccp_skb_entail(sk, skb);
- dccp_transmit_skb(sk, skb_clone(skb, prio));
- } else
- dccp_transmit_skb(sk, skb);
+ skb = dccp_skb_entail(sk, skb);
+ /*
+ * Retransmission timer for active-close: RFC 4340, 8.3 requires
+ * retransmitting the Close/CloseReq until the CLOSING/CLOSEREQ
+ * state can be left. The initial timeout is 2 RTTs.
+ * Since RTT measurement is done by the CCIDs, there is no easy
+ * way to get an RTT sample. The fallback RTT from RFC 4340, 3.4
+ * is too low (200ms); we use a high value to avoid unnecessary
+ * retransmissions when the link RTT is > 0.2 seconds.
+ * FIXME: Let main module sample RTTs and use that instead.
+ */
+ inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
+ DCCP_TIMEOUT_INIT, DCCP_RTO_MAX);
+ }
+ dccp_transmit_skb(sk, skb);
}
diff --git a/net/dccp/probe.c b/net/dccp/probe.c
new file mode 100644
index 00000000000..595ddf0459d
--- /dev/null
+++ b/net/dccp/probe.c
@@ -0,0 +1,203 @@
+/*
+ * dccp_probe - Observe the DCCP flow with kprobes.
+ *
+ * The idea for this came from Werner Almesberger's umlsim
+ * Copyright (C) 2004, Stephen Hemminger <shemminger@osdl.org>
+ *
+ * Modified for DCCP from Stephen Hemminger's code
+ * Copyright (C) 2006, Ian McDonald <ian.mcdonald@jandi.co.nz>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include <linux/kernel.h>
+#include <linux/kprobes.h>
+#include <linux/socket.h>
+#include <linux/dccp.h>
+#include <linux/proc_fs.h>
+#include <linux/module.h>
+#include <linux/kfifo.h>
+#include <linux/vmalloc.h>
+#include <linux/gfp.h>
+#include <net/net_namespace.h>
+
+#include "dccp.h"
+#include "ccid.h"
+#include "ccids/ccid3.h"
+
+/* Port filter (module parameter): 0 logs every socket, otherwise only
+ * sockets whose source or destination port equals this value. */
+static int port;
+
+/* Log buffer size (module parameter), handed to kfifo_alloc() at init. */
+static int bufsize = 64 * 1024;
+
+/* Name of the proc entry created under init_net.proc_net. */
+static const char procname[] = "dccpprobe";
+
+/* Log state shared by the probe handler (writer) and the proc reader. */
+static struct {
+ /* formatted log text waiting to be read */
+ struct kfifo fifo;
+ /* lock passed to kfifo_in_locked() and kfifo_out_locked() */
+ spinlock_t lock;
+ /* readers sleep here until the fifo is non-empty */
+ wait_queue_head_t wait;
+ /* set on open(); log timestamps are relative to this instant */
+ struct timespec tstart;
+} dccpw;
+
+/*
+ * printl - append one formatted record to the log fifo and wake readers.
+ * @fmt: printf-style format for the record body, followed by its arguments
+ *
+ * Each record is prefixed with "<sec>.<usec> ", the time elapsed since
+ * dccpw.tstart. The record is assembled in a 256-byte stack buffer;
+ * vscnprintf() limits the body to the space left after the prefix.
+ */
+static void printl(const char *fmt, ...)
+{
+ va_list args;
+ int len;
+ struct timespec now;
+ char tbuf[256];
+
+ va_start(args, fmt);
+ getnstimeofday(&now);
+
+ /* make the timestamp relative to the last open of the proc file */
+ now = timespec_sub(now, dccpw.tstart);
+
+ len = sprintf(tbuf, "%lu.%06lu ",
+ (unsigned long) now.tv_sec,
+ (unsigned long) now.tv_nsec / NSEC_PER_USEC);
+ len += vscnprintf(tbuf+len, sizeof(tbuf)-len, fmt, args);
+ va_end(args);
+
+ /* NOTE(review): the return value is not checked, so a record that
+ * does not fit into the fifo is not reported here. */
+ kfifo_in_locked(&dccpw.fifo, tbuf, len, &dccpw.lock);
+ wake_up(&dccpw.wait);
+}
+
+/*
+ * jdccp_sendmsg - jprobe handler attached to dccp_sendmsg().
+ * @iocb: unused
+ * @sk: socket the message is being sent on
+ * @msg: unused
+ * @size: payload size passed to dccp_sendmsg()
+ *
+ * Logs one record per call for sockets that pass the port filter (port == 0
+ * matches everything). When the socket's current TX CCID is CCID-3 the
+ * record additionally carries the CCID-3 sender state read from
+ * ccid3_hc_tx_sk().
+ *
+ * @size is a size_t and is therefore printed with %zu.
+ */
+static int jdccp_sendmsg(struct kiocb *iocb, struct sock *sk,
+ struct msghdr *msg, size_t size)
+{
+ const struct inet_sock *inet = inet_sk(sk);
+ struct ccid3_hc_tx_sock *hc = NULL;
+
+ if (ccid_get_current_tx_ccid(dccp_sk(sk)) == DCCPC_CCID3)
+ hc = ccid3_hc_tx_sk(sk);
+
+ if (port == 0 || ntohs(inet->inet_dport) == port ||
+ ntohs(inet->inet_sport) == port) {
+ if (hc)
+ printl("%pI4:%u %pI4:%u %zu %d %d %d %u %llu %llu %d\n",
+ &inet->inet_saddr, ntohs(inet->inet_sport),
+ &inet->inet_daddr, ntohs(inet->inet_dport), size,
+ hc->tx_s, hc->tx_rtt, hc->tx_p,
+ hc->tx_x_calc, hc->tx_x_recv >> 6,
+ hc->tx_x >> 6, hc->tx_t_ipi);
+ else
+ printl("%pI4:%u %pI4:%u %zu\n",
+ &inet->inet_saddr, ntohs(inet->inet_sport),
+ &inet->inet_daddr, ntohs(inet->inet_dport),
+ size);
+ }
+
+ jprobe_return();
+ return 0;
+}
+
+/* jprobe descriptor: hooks jdccp_sendmsg() onto the symbol "dccp_sendmsg". */
+static struct jprobe dccp_send_probe = {
+ .kp = {
+ .symbol_name = "dccp_sendmsg",
+ },
+ .entry = jdccp_sendmsg,
+};
+
+/*
+ * dccpprobe_open - open handler of the proc entry.
+ *
+ * Discards any log data still queued and restarts the time base used for
+ * the relative timestamps written by printl().
+ *
+ * kfifo_reset() must not run concurrently with other fifo users, so it is
+ * done under dccpw.lock, the lock the writer (kfifo_in_locked) and the
+ * reader (kfifo_out_locked) already take.
+ *
+ * Always returns 0.
+ */
+static int dccpprobe_open(struct inode *inode, struct file *file)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&dccpw.lock, flags);
+ kfifo_reset(&dccpw.fifo);
+ spin_unlock_irqrestore(&dccpw.lock, flags);
+
+ getnstimeofday(&dccpw.tstart);
+ return 0;
+}
+
+/*
+ * dccpprobe_read - read handler of the proc entry.
+ * @file: unused
+ * @buf: user buffer to fill
+ * @len: capacity of @buf
+ * @ppos: unused
+ *
+ * Sleeps interruptibly until the log fifo is non-empty, then moves up to
+ * @len bytes through a temporary kernel buffer into @buf.
+ *
+ * Returns the number of bytes copied, 0 if @len is 0, -EINVAL if @buf is
+ * NULL, -ENOMEM if the temporary buffer cannot be allocated, -EFAULT if the
+ * copy to user space fails, or the error from wait_event_interruptible().
+ */
+static ssize_t dccpprobe_read(struct file *file, char __user *buf,
+ size_t len, loff_t *ppos)
+{
+ int error = 0, cnt = 0;
+ unsigned char *tbuf;
+
+ if (!buf)
+ return -EINVAL;
+
+ if (len == 0)
+ return 0;
+
+ /* NOTE(review): the bounce buffer is sized by the caller-supplied
+ * @len and allocated before waiting - confirm this is acceptable. */
+ tbuf = vmalloc(len);
+ if (!tbuf)
+ return -ENOMEM;
+
+ error = wait_event_interruptible(dccpw.wait,
+ kfifo_len(&dccpw.fifo) != 0);
+ if (error)
+ goto out_free;
+
+ /* data is taken out of the fifo before the copy to user space, so a
+ * failing copy_to_user() does not put it back */
+ cnt = kfifo_out_locked(&dccpw.fifo, tbuf, len, &dccpw.lock);
+ error = copy_to_user(buf, tbuf, cnt) ? -EFAULT : 0;
+
+out_free:
+ vfree(tbuf);
+
+ return error ? error : cnt;
+}
+
+/* File operations of the proc entry: open resets the log, read drains it. */
+static const struct file_operations dccpprobe_fops = {
+ .owner = THIS_MODULE,
+ .open = dccpprobe_open,
+ .read = dccpprobe_read,
+ .llseek = noop_llseek,
+};
+
+/*
+ * dccpprobe_init - module entry point.
+ *
+ * Sets up, in this order: the wait queue and lock, the log fifo (bufsize),
+ * the read-only proc entry, and finally the jprobe on dccp_sendmsg().
+ *
+ * Returns 0 on success. On failure everything set up so far is undone and
+ * -ENOMEM (fifo or proc entry) or the probe registration error is returned.
+ */
+static __init int dccpprobe_init(void)
+{
+ int ret = -ENOMEM;
+
+ init_waitqueue_head(&dccpw.wait);
+ spin_lock_init(&dccpw.lock);
+ if (kfifo_alloc(&dccpw.fifo, bufsize, GFP_KERNEL))
+ return ret;
+ if (!proc_create(procname, S_IRUSR, init_net.proc_net, &dccpprobe_fops))
+ goto err0;
+
+ /* If the first registration fails, request the "dccp" module and
+ * try once more - presumably the probed symbol is not available
+ * until that module is loaded. */
+ ret = register_jprobe(&dccp_send_probe);
+ if (ret) {
+ ret = request_module("dccp");
+ if (!ret)
+ ret = register_jprobe(&dccp_send_probe);
+ }
+
+ if (ret)
+ goto err1;
+
+ pr_info("DCCP watch registered (port=%d)\n", port);
+ return 0;
+err1:
+ remove_proc_entry(procname, init_net.proc_net);
+err0:
+ kfifo_free(&dccpw.fifo);
+ return ret;
+}
+module_init(dccpprobe_init);
+
+/*
+ * dccpprobe_exit - module exit point.
+ *
+ * Tears down in the reverse order of dccpprobe_init(): the jprobe (the fifo
+ * writer) and the proc entry (the fifo reader) are removed first, and the
+ * fifo they both use is freed last, so neither can touch it after it has
+ * been released.
+ */
+static __exit void dccpprobe_exit(void)
+{
+ unregister_jprobe(&dccp_send_probe);
+ remove_proc_entry(procname, init_net.proc_net);
+ kfifo_free(&dccpw.fifo);
+}
+module_exit(dccpprobe_exit);
+
+MODULE_PARM_DESC(port, "Port to match (0=all)");
+module_param(port, int, 0);
+
+MODULE_PARM_DESC(bufsize, "Log buffer size (default 64k)");
+module_param(bufsize, int, 0);
+
+MODULE_AUTHOR("Ian McDonald <ian.mcdonald@jandi.co.nz>");
+MODULE_DESCRIPTION("DCCP snooper");
+MODULE_LICENSE("GPL");
diff --git a/net/dccp/proto.c b/net/dccp/proto.c
index 65b11ea90d8..de2c1e71930 100644
--- a/net/dccp/proto.c
+++ b/net/dccp/proto.c
@@ -9,7 +9,6 @@
* published by the Free Software Foundation.
*/
-#include <linux/config.h>
#include <linux/dccp.h>
#include <linux/module.h>
#include <linux/types.h>
@@ -21,15 +20,14 @@
#include <linux/if_arp.h>
#include <linux/init.h>
#include <linux/random.h>
+#include <linux/slab.h>
#include <net/checksum.h>
-#include <net/inet_common.h>
#include <net/inet_sock.h>
-#include <net/protocol.h>
#include <net/sock.h>
#include <net/xfrm.h>
-#include <asm/semaphore.h>
+#include <asm/ioctls.h>
#include <linux/spinlock.h>
#include <linux/timer.h>
#include <linux/delay.h>
@@ -37,24 +35,121 @@
#include "ccid.h"
#include "dccp.h"
+#include "feat.h"
DEFINE_SNMP_STAT(struct dccp_mib, dccp_statistics) __read_mostly;
EXPORT_SYMBOL_GPL(dccp_statistics);
-atomic_t dccp_orphan_count = ATOMIC_INIT(0);
-
+struct percpu_counter dccp_orphan_count;
EXPORT_SYMBOL_GPL(dccp_orphan_count);
-static struct net_protocol dccp_protocol = {
- .handler = dccp_v4_rcv,
- .err_handler = dccp_v4_err,
- .no_policy = 1,
-};
+struct inet_hashinfo dccp_hashinfo;
+EXPORT_SYMBOL_GPL(dccp_hashinfo);
+
+/* the maximum queue length for tx in packets. 0 is no limit */
+int sysctl_dccp_tx_qlen __read_mostly = 5;
+
+#ifdef CONFIG_IP_DCCP_DEBUG
+static const char *dccp_state_name(const int state) /* map a DCCP_* socket state to a printable name for debug output */
+{
+ static const char *const dccp_state_names[] = {
+ [DCCP_OPEN] = "OPEN",
+ [DCCP_REQUESTING] = "REQUESTING",
+ [DCCP_PARTOPEN] = "PARTOPEN",
+ [DCCP_LISTEN] = "LISTEN",
+ [DCCP_RESPOND] = "RESPOND",
+ [DCCP_CLOSING] = "CLOSING",
+ [DCCP_ACTIVE_CLOSEREQ] = "CLOSEREQ",
+ [DCCP_PASSIVE_CLOSE] = "PASSIVE_CLOSE",
+ [DCCP_PASSIVE_CLOSEREQ] = "PASSIVE_CLOSEREQ",
+ [DCCP_TIME_WAIT] = "TIME_WAIT",
+ [DCCP_CLOSED] = "CLOSED",
+ };
+
+ if (state >= DCCP_MAX_STATES) /* only the upper bound is checked here */
+ return "INVALID STATE!";
+ else
+ return dccp_state_names[state]; /* NOTE(review): any slot without an initializer above yields NULL */
+}
+#endif
+
+void dccp_set_state(struct sock *sk, const int state) /* move sk to the given state, updating MIB counters and hash/port bookkeeping on the way */
+{
+ const int oldstate = sk->sk_state;
+
+ dccp_pr_debug("%s(%p) %s --> %s\n", dccp_role(sk), sk,
+ dccp_state_name(oldstate), dccp_state_name(state));
+ WARN_ON(state == oldstate); /* a transition to the current state triggers a warning */
+
+ switch (state) {
+ case DCCP_OPEN:
+ if (oldstate != DCCP_OPEN)
+ DCCP_INC_STATS(DCCP_MIB_CURRESTAB);
+ /* Client retransmits all Confirm options until entering OPEN */
+ if (oldstate == DCCP_PARTOPEN)
+ dccp_feat_list_purge(&dccp_sk(sk)->dccps_featneg);
+ break;
+
+ case DCCP_CLOSED:
+ if (oldstate == DCCP_OPEN || oldstate == DCCP_ACTIVE_CLOSEREQ ||
+ oldstate == DCCP_CLOSING)
+ DCCP_INC_STATS(DCCP_MIB_ESTABRESETS);
+
+ sk->sk_prot->unhash(sk);
+ if (inet_csk(sk)->icsk_bind_hash != NULL &&
+ !(sk->sk_userlocks & SOCK_BINDPORT_LOCK))
+ inet_put_port(sk); /* release the bind bucket unless SOCK_BINDPORT_LOCK is set */
+ /* fall through */
+ default:
+ if (oldstate == DCCP_OPEN)
+ DCCP_DEC_STATS(DCCP_MIB_CURRESTAB); /* leaving OPEN: undo the increment done on entry */
+ }
+
+ /* Change state AFTER socket is unhashed to avoid closed
+ * socket sitting in hash tables.
+ */
+ sk->sk_state = state;
+}
+
+EXPORT_SYMBOL_GPL(dccp_set_state);
+
+static void dccp_finish_passive_close(struct sock *sk) /* complete a close started by the peer; does nothing in any other state */
+{
+ switch (sk->sk_state) {
+ case DCCP_PASSIVE_CLOSE:
+ /* Node (client or server) has received Close packet. */
+ dccp_send_reset(sk, DCCP_RESET_CODE_CLOSED);
+ dccp_set_state(sk, DCCP_CLOSED);
+ break;
+ case DCCP_PASSIVE_CLOSEREQ:
+ /*
+ * Client received CloseReq. We set the `active' flag so that
+ * dccp_send_close() retransmits the Close as per RFC 4340, 8.3.
+ */
+ dccp_send_close(sk, 1);
+ dccp_set_state(sk, DCCP_CLOSING);
+ }
+}
+
+void dccp_done(struct sock *sk) /* put sk into CLOSED, stop its transmit timers, then either notify or destroy it */
+{
+ dccp_set_state(sk, DCCP_CLOSED);
+ dccp_clear_xmit_timers(sk);
+
+ sk->sk_shutdown = SHUTDOWN_MASK;
+
+ if (!sock_flag(sk, SOCK_DEAD))
+ sk->sk_state_change(sk); /* SOCK_DEAD not set: invoke the state-change callback */
+ else
+ inet_csk_destroy_sock(sk); /* SOCK_DEAD set: destroy the socket right here */
+}
+
+EXPORT_SYMBOL_GPL(dccp_done);
const char *dccp_packet_name(const int type)
{
- static const char *dccp_packet_names[] = {
+ static const char *const dccp_packet_names[] = {
[DCCP_PKT_REQUEST] = "REQUEST",
[DCCP_PKT_RESPONSE] = "RESPONSE",
[DCCP_PKT_DATA] = "DATA",
@@ -75,39 +170,82 @@ const char *dccp_packet_name(const int type)
EXPORT_SYMBOL_GPL(dccp_packet_name);
-const char *dccp_state_name(const int state)
-{
- static char *dccp_state_names[] = {
- [DCCP_OPEN] = "OPEN",
- [DCCP_REQUESTING] = "REQUESTING",
- [DCCP_PARTOPEN] = "PARTOPEN",
- [DCCP_LISTEN] = "LISTEN",
- [DCCP_RESPOND] = "RESPOND",
- [DCCP_CLOSING] = "CLOSING",
- [DCCP_TIME_WAIT] = "TIME_WAIT",
- [DCCP_CLOSED] = "CLOSED",
- };
+int dccp_init_sock(struct sock *sk, const __u8 ctl_sock_initialized) /* set initial per-socket DCCP state; returns 0, or the result of dccp_feat_init() when ctl_sock_initialized is non-zero */
+{
+ struct dccp_sock *dp = dccp_sk(sk);
+ struct inet_connection_sock *icsk = inet_csk(sk);
- if (state >= DCCP_MAX_STATES)
- return "INVALID STATE!";
- else
- return dccp_state_names[state];
+ icsk->icsk_rto = DCCP_TIMEOUT_INIT;
+ icsk->icsk_syn_retries = sysctl_dccp_request_retries;
+ sk->sk_state = DCCP_CLOSED;
+ sk->sk_write_space = dccp_write_space;
+ icsk->icsk_sync_mss = dccp_sync_mss;
+ dp->dccps_mss_cache = 536;
+ dp->dccps_rate_last = jiffies;
+ dp->dccps_role = DCCP_ROLE_UNDEFINED;
+ dp->dccps_service = DCCP_SERVICE_CODE_IS_ABSENT;
+ dp->dccps_tx_qlen = sysctl_dccp_tx_qlen; /* starts from the sysctl value; DCCP_SOCKOPT_QPOLICY_TXQLEN can override it per socket */
+
+ dccp_init_xmit_timers(sk);
+
+ INIT_LIST_HEAD(&dp->dccps_featneg);
+ /* control socket doesn't need feat nego */
+ if (likely(ctl_sock_initialized))
+ return dccp_feat_init(sk);
+ return 0;
}
-EXPORT_SYMBOL_GPL(dccp_state_name);
+EXPORT_SYMBOL_GPL(dccp_init_sock);
-static inline int dccp_listen_start(struct sock *sk)
+void dccp_destroy_sock(struct sock *sk) /* release the DCCP-specific resources attached to sk */
{
struct dccp_sock *dp = dccp_sk(sk);
- dp->dccps_role = DCCP_ROLE_LISTEN;
/*
- * Apps need to use setsockopt(DCCP_SOCKOPT_SERVICE)
- * before calling listen()
+ * DCCP doesn't use sk_write_queue, just sk_send_head
+ * for retransmissions
*/
- if (dccp_service_not_initialized(sk))
+ if (sk->sk_send_head != NULL) {
+ kfree_skb(sk->sk_send_head);
+ sk->sk_send_head = NULL;
+ }
+
+ /* Clean up a referenced DCCP bind bucket. */
+ if (inet_csk(sk)->icsk_bind_hash != NULL)
+ inet_put_port(sk);
+
+ kfree(dp->dccps_service_list);
+ dp->dccps_service_list = NULL; /* each freed pointer is reset to NULL right after its release */
+
+ if (dp->dccps_hc_rx_ackvec != NULL) {
+ dccp_ackvec_free(dp->dccps_hc_rx_ackvec);
+ dp->dccps_hc_rx_ackvec = NULL;
+ }
+ ccid_hc_rx_delete(dp->dccps_hc_rx_ccid, sk);
+ ccid_hc_tx_delete(dp->dccps_hc_tx_ccid, sk);
+ dp->dccps_hc_rx_ccid = dp->dccps_hc_tx_ccid = NULL;
+
+ /* clean up feature negotiation state */
+ dccp_feat_list_purge(&dp->dccps_featneg);
+}
+
+EXPORT_SYMBOL_GPL(dccp_destroy_sock);
+
+static inline int dccp_listen_start(struct sock *sk, int backlog) /* mark sk as listener and start listening; returns -EPROTO or the result of inet_csk_listen_start() */
+{
+ struct dccp_sock *dp = dccp_sk(sk);
+
+ dp->dccps_role = DCCP_ROLE_LISTEN; /* role is set before the feature check and is not reverted on failure */
+ /* do not start to listen if feature negotiation setup fails */
+ if (dccp_feat_finalise_settings(dp))
return -EPROTO;
- return inet_csk_listen_start(sk, TCP_SYNQ_HSIZE);
+ return inet_csk_listen_start(sk, backlog);
+}
+
+static inline int dccp_need_reset(int state) /* non-zero unless state is CLOSED, LISTEN or REQUESTING; dccp_disconnect() uses it to decide whether to send a Reset */
+{
+ return state != DCCP_CLOSED && state != DCCP_LISTEN &&
+ state != DCCP_REQUESTING;
}
int dccp_disconnect(struct sock *sk, int flags)
@@ -120,21 +258,28 @@ int dccp_disconnect(struct sock *sk, int flags)
if (old_state != DCCP_CLOSED)
dccp_set_state(sk, DCCP_CLOSED);
- /* ABORT function of RFC793 */
+ /*
+ * This corresponds to the ABORT function of RFC793, sec. 3.8
+ * TCP uses a RST segment, DCCP a Reset packet with Code 2, "Aborted".
+ */
if (old_state == DCCP_LISTEN) {
inet_csk_listen_stop(sk);
- /* FIXME: do the active reset thing */
+ } else if (dccp_need_reset(old_state)) {
+ dccp_send_reset(sk, DCCP_RESET_CODE_ABORTED);
+ sk->sk_err = ECONNRESET;
} else if (old_state == DCCP_REQUESTING)
sk->sk_err = ECONNRESET;
dccp_clear_xmit_timers(sk);
+
__skb_queue_purge(&sk->sk_receive_queue);
+ __skb_queue_purge(&sk->sk_write_queue);
if (sk->sk_send_head != NULL) {
__kfree_skb(sk->sk_send_head);
sk->sk_send_head = NULL;
}
- inet->dport = 0;
+ inet->inet_dport = 0;
if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK))
inet_reset_saddr(sk);
@@ -146,7 +291,7 @@ int dccp_disconnect(struct sock *sk, int flags)
inet_csk_delack_init(sk);
__sk_dst_reset(sk);
- BUG_TRAP(!inet->num || icsk->icsk_bind_hash);
+ WARN_ON(inet->inet_num && !icsk->icsk_bind_hash);
sk->sk_error_report(sk);
return err;
@@ -167,7 +312,7 @@ unsigned int dccp_poll(struct file *file, struct socket *sock,
unsigned int mask;
struct sock *sk = sock->sk;
- poll_wait(file, sk->sk_sleep, wait);
+ sock_poll_wait(file, sk_sleep(sk), wait);
if (sk->sk_state == DCCP_LISTEN)
return inet_csk_listen_poll(sk);
@@ -183,7 +328,7 @@ unsigned int dccp_poll(struct file *file, struct socket *sock,
if (sk->sk_shutdown == SHUTDOWN_MASK || sk->sk_state == DCCP_CLOSED)
mask |= POLLHUP;
if (sk->sk_shutdown & RCV_SHUTDOWN)
- mask |= POLLIN | POLLRDNORM;
+ mask |= POLLIN | POLLRDNORM | POLLRDHUP;
/* Connected? */
if ((1 << sk->sk_state) & ~(DCCPF_REQUESTING | DCCPF_RESPOND)) {
@@ -191,7 +336,7 @@ unsigned int dccp_poll(struct file *file, struct socket *sock,
mask |= POLLIN | POLLRDNORM;
if (!(sk->sk_shutdown & SEND_SHUTDOWN)) {
- if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk)) {
+ if (sk_stream_is_writeable(sk)) {
mask |= POLLOUT | POLLWRNORM;
} else { /* send SIGIO later */
set_bit(SOCK_ASYNC_NOSPACE,
@@ -202,7 +347,7 @@ unsigned int dccp_poll(struct file *file, struct socket *sock,
* wspace test but before the flags are set,
* IO signal will be lost.
*/
- if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk))
+ if (sk_stream_is_writeable(sk))
mask |= POLLOUT | POLLWRNORM;
}
}
@@ -214,19 +359,47 @@ EXPORT_SYMBOL_GPL(dccp_poll);
int dccp_ioctl(struct sock *sk, int cmd, unsigned long arg)
{
- dccp_pr_debug("entry\n");
- return -ENOIOCTLCMD;
+ int rc = -ENOTCONN; /* returned unchanged when the socket is in LISTEN state */
+
+ lock_sock(sk);
+
+ if (sk->sk_state == DCCP_LISTEN)
+ goto out;
+
+ switch (cmd) {
+ case SIOCINQ: { /* report the length of the first queued packet, 0 if the receive queue is empty */
+ struct sk_buff *skb;
+ unsigned long amount = 0;
+
+ skb = skb_peek(&sk->sk_receive_queue);
+ if (skb != NULL) {
+ /*
+ * We will only return the amount of this packet since
+ * that is all that will be read.
+ */
+ amount = skb->len;
+ }
+ rc = put_user(amount, (int __user *)arg); /* arg is treated as a user pointer to int */
+ }
+ break;
+ default:
+ rc = -ENOIOCTLCMD; /* every other command is unsupported here */
+ break;
+ }
+out:
+ release_sock(sk);
+ return rc;
}
EXPORT_SYMBOL_GPL(dccp_ioctl);
-static int dccp_setsockopt_service(struct sock *sk, const u32 service,
- char __user *optval, int optlen)
+static int dccp_setsockopt_service(struct sock *sk, const __be32 service,
+ char __user *optval, unsigned int optlen)
{
struct dccp_sock *dp = dccp_sk(sk);
struct dccp_service_list *sl = NULL;
- if (service == DCCP_SERVICE_INVALID_VALUE ||
+ if (service == DCCP_SERVICE_INVALID_VALUE ||
optlen > DCCP_SERVICE_LIST_MAX_LEN * sizeof(u32))
return -EINVAL;
@@ -255,19 +428,88 @@ static int dccp_setsockopt_service(struct sock *sk, const u32 service,
return 0;
}
-int dccp_setsockopt(struct sock *sk, int level, int optname,
- char __user *optval, int optlen)
+static int dccp_setsockopt_cscov(struct sock *sk, int cscov, bool rx) /* register minimum checksum coverage cscov (0..15) for the RX (rx true) or TX side; returns 0, -EINVAL, -ENOBUFS or the feature registration error */
{
- struct dccp_sock *dp;
- int err;
- int val;
+ u8 *list, len;
+ int i, rc;
- if (level != SOL_DCCP)
- return inet_csk(sk)->icsk_af_ops->setsockopt(sk, level,
- optname, optval,
- optlen);
+ if (cscov < 0 || cscov > 15)
+ return -EINVAL;
+ /*
+ * Populate a list of permissible values, in the range cscov...15. This
+ * is necessary since feature negotiation of single values only works if
+ * both sides incidentally choose the same value. Since the list starts
+ * lowest-value first, negotiation will pick the smallest shared value.
+ */
+ if (cscov == 0)
+ return 0;
+ len = 16 - cscov;
+
+ list = kmalloc(len, GFP_KERNEL);
+ if (list == NULL)
+ return -ENOBUFS;
+
+ for (i = 0; i < len; i++)
+ list[i] = cscov + i; /* fill cscov..15 without modifying cscov, which is stored below */
+
+ rc = dccp_feat_register_sp(sk, DCCPF_MIN_CSUM_COVER, rx, list, len);
+
+ if (rc == 0) { /* remember the requested value; getsockopt SEND_CSCOV/RECV_CSCOV reads these fields */
+ if (rx)
+ dccp_sk(sk)->dccps_pcrlen = cscov;
+ else
+ dccp_sk(sk)->dccps_pcslen = cscov;
+ }
+ kfree(list); /* the temporary list is freed on both success and failure */
+ return rc;
+}
- if (optlen < sizeof(int))
+static int dccp_setsockopt_ccid(struct sock *sk, int type,
+ char __user *optval, unsigned int optlen)
+{ /* register a user-supplied CCID list for TX, RX or both, selected by type; returns 0 or a negative error */
+ u8 *val;
+ int rc = 0;
+
+ if (optlen < 1 || optlen > DCCP_FEAT_MAX_SP_VALS)
+ return -EINVAL;
+
+ val = memdup_user(optval, optlen); /* kernel copy of the user list, freed before returning */
+ if (IS_ERR(val))
+ return PTR_ERR(val);
+
+ lock_sock(sk);
+ if (type == DCCP_SOCKOPT_TX_CCID || type == DCCP_SOCKOPT_CCID)
+ rc = dccp_feat_register_sp(sk, DCCPF_CCID, 1, val, optlen);
+
+ if (!rc && (type == DCCP_SOCKOPT_RX_CCID || type == DCCP_SOCKOPT_CCID)) /* skipped when the registration above failed */
+ rc = dccp_feat_register_sp(sk, DCCPF_CCID, 0, val, optlen);
+ release_sock(sk);
+
+ kfree(val);
+ return rc;
+}
+
+static int do_dccp_setsockopt(struct sock *sk, int level, int optname,
+ char __user *optval, unsigned int optlen)
+{
+ struct dccp_sock *dp = dccp_sk(sk);
+ int val, err = 0;
+
+ switch (optname) {
+ case DCCP_SOCKOPT_PACKET_SIZE:
+ DCCP_WARN("sockopt(PACKET_SIZE) is deprecated: fix your app\n");
+ return 0;
+ case DCCP_SOCKOPT_CHANGE_L:
+ case DCCP_SOCKOPT_CHANGE_R:
+ DCCP_WARN("sockopt(CHANGE_L/R) is deprecated: fix your app\n");
+ return 0;
+ case DCCP_SOCKOPT_CCID:
+ case DCCP_SOCKOPT_RX_CCID:
+ case DCCP_SOCKOPT_TX_CCID:
+ return dccp_setsockopt_ccid(sk, optname, optval, optlen);
+ }
+
+ if (optlen < (int)sizeof(int))
return -EINVAL;
if (get_user(val, (int __user *)optval))
@@ -277,26 +519,69 @@ int dccp_setsockopt(struct sock *sk, int level, int optname,
return dccp_setsockopt_service(sk, val, optval, optlen);
lock_sock(sk);
- dp = dccp_sk(sk);
- err = 0;
-
switch (optname) {
- case DCCP_SOCKOPT_PACKET_SIZE:
- dp->dccps_packet_size = val;
+ case DCCP_SOCKOPT_SERVER_TIMEWAIT:
+ if (dp->dccps_role != DCCP_ROLE_SERVER)
+ err = -EOPNOTSUPP;
+ else
+ dp->dccps_server_timewait = (val != 0);
+ break;
+ case DCCP_SOCKOPT_SEND_CSCOV:
+ err = dccp_setsockopt_cscov(sk, val, false);
+ break;
+ case DCCP_SOCKOPT_RECV_CSCOV:
+ err = dccp_setsockopt_cscov(sk, val, true);
+ break;
+ case DCCP_SOCKOPT_QPOLICY_ID:
+ if (sk->sk_state != DCCP_CLOSED)
+ err = -EISCONN;
+ else if (val < 0 || val >= DCCPQ_POLICY_MAX)
+ err = -EINVAL;
+ else
+ dp->dccps_qpolicy = val;
+ break;
+ case DCCP_SOCKOPT_QPOLICY_TXQLEN:
+ if (val < 0)
+ err = -EINVAL;
+ else
+ dp->dccps_tx_qlen = val;
break;
default:
err = -ENOPROTOOPT;
break;
}
-
release_sock(sk);
+
return err;
}
+int dccp_setsockopt(struct sock *sk, int level, int optname,
+ char __user *optval, unsigned int optlen)
+{ /* setsockopt entry point: SOL_DCCP is handled by do_dccp_setsockopt(), other levels by the address-family handler */
+ if (level != SOL_DCCP)
+ return inet_csk(sk)->icsk_af_ops->setsockopt(sk, level,
+ optname, optval,
+ optlen);
+ return do_dccp_setsockopt(sk, level, optname, optval, optlen);
+}
+
EXPORT_SYMBOL_GPL(dccp_setsockopt);
+#ifdef CONFIG_COMPAT
+int compat_dccp_setsockopt(struct sock *sk, int level, int optname,
+ char __user *optval, unsigned int optlen)
+{ /* CONFIG_COMPAT variant: non-SOL_DCCP levels go to inet_csk_compat_setsockopt(), SOL_DCCP shares do_dccp_setsockopt() */
+ if (level != SOL_DCCP)
+ return inet_csk_compat_setsockopt(sk, level, optname,
+ optval, optlen);
+ return do_dccp_setsockopt(sk, level, optname, optval, optlen);
+}
+
+EXPORT_SYMBOL_GPL(compat_dccp_setsockopt);
+#endif
+
static int dccp_getsockopt_service(struct sock *sk, int len,
- u32 __user *optval,
+ __be32 __user *optval,
int __user *optlen)
{
const struct dccp_sock *dp = dccp_sk(sk);
@@ -304,9 +589,6 @@ static int dccp_getsockopt_service(struct sock *sk, int len,
int err = -ENOENT, slen = 0, total_len = sizeof(u32);
lock_sock(sk);
- if (dccp_service_not_initialized(sk))
- goto out;
-
if ((sl = dp->dccps_service_list) != NULL) {
slen = sl->dccpsl_nr * sizeof(u32);
total_len += slen;
@@ -326,32 +608,57 @@ out:
return err;
}
-int dccp_getsockopt(struct sock *sk, int level, int optname,
+static int do_dccp_getsockopt(struct sock *sk, int level, int optname,
char __user *optval, int __user *optlen)
{
struct dccp_sock *dp;
int val, len;
- if (level != SOL_DCCP)
- return inet_csk(sk)->icsk_af_ops->getsockopt(sk, level,
- optname, optval,
- optlen);
if (get_user(len, optlen))
return -EFAULT;
- if (len < sizeof(int))
+ if (len < (int)sizeof(int))
return -EINVAL;
dp = dccp_sk(sk);
switch (optname) {
case DCCP_SOCKOPT_PACKET_SIZE:
- val = dp->dccps_packet_size;
- len = sizeof(dp->dccps_packet_size);
- break;
+ DCCP_WARN("sockopt(PACKET_SIZE) is deprecated: fix your app\n");
+ return 0;
case DCCP_SOCKOPT_SERVICE:
return dccp_getsockopt_service(sk, len,
- (u32 __user *)optval, optlen);
+ (__be32 __user *)optval, optlen);
+ case DCCP_SOCKOPT_GET_CUR_MPS:
+ val = dp->dccps_mss_cache;
+ break;
+ case DCCP_SOCKOPT_AVAILABLE_CCIDS:
+ return ccid_getsockopt_builtin_ccids(sk, len, optval, optlen);
+ case DCCP_SOCKOPT_TX_CCID:
+ val = ccid_get_current_tx_ccid(dp);
+ if (val < 0)
+ return -ENOPROTOOPT;
+ break;
+ case DCCP_SOCKOPT_RX_CCID:
+ val = ccid_get_current_rx_ccid(dp);
+ if (val < 0)
+ return -ENOPROTOOPT;
+ break;
+ case DCCP_SOCKOPT_SERVER_TIMEWAIT:
+ val = dp->dccps_server_timewait;
+ break;
+ case DCCP_SOCKOPT_SEND_CSCOV:
+ val = dp->dccps_pcslen;
+ break;
+ case DCCP_SOCKOPT_RECV_CSCOV:
+ val = dp->dccps_pcrlen;
+ break;
+ case DCCP_SOCKOPT_QPOLICY_ID:
+ val = dp->dccps_qpolicy;
+ break;
+ case DCCP_SOCKOPT_QPOLICY_TXQLEN:
+ val = dp->dccps_tx_qlen;
+ break;
case 128 ... 191:
return ccid_hc_rx_getsockopt(dp->dccps_hc_rx_ccid, sk, optname,
len, (u32 __user *)optval, optlen);
@@ -362,14 +669,79 @@ int dccp_getsockopt(struct sock *sk, int level, int optname,
return -ENOPROTOOPT;
}
+ len = sizeof(val);
if (put_user(len, optlen) || copy_to_user(optval, &val, len))
return -EFAULT;
return 0;
}
+int dccp_getsockopt(struct sock *sk, int level, int optname,
+ char __user *optval, int __user *optlen)
+{ /* getsockopt entry point: SOL_DCCP is handled by do_dccp_getsockopt(), other levels by the address-family handler */
+ if (level != SOL_DCCP)
+ return inet_csk(sk)->icsk_af_ops->getsockopt(sk, level,
+ optname, optval,
+ optlen);
+ return do_dccp_getsockopt(sk, level, optname, optval, optlen);
+}
+
EXPORT_SYMBOL_GPL(dccp_getsockopt);
+#ifdef CONFIG_COMPAT
+int compat_dccp_getsockopt(struct sock *sk, int level, int optname,
+ char __user *optval, int __user *optlen)
+{ /* CONFIG_COMPAT variant: non-SOL_DCCP levels go to inet_csk_compat_getsockopt(), SOL_DCCP shares do_dccp_getsockopt() */
+ if (level != SOL_DCCP)
+ return inet_csk_compat_getsockopt(sk, level, optname,
+ optval, optlen);
+ return do_dccp_getsockopt(sk, level, optname, optval, optlen);
+}
+
+EXPORT_SYMBOL_GPL(compat_dccp_getsockopt);
+#endif
+
+static int dccp_msghdr_parse(struct msghdr *msg, struct sk_buff *skb) /* apply the SOL_DCCP control messages in msg to skb; returns 0 or -EINVAL */
+{
+ struct cmsghdr *cmsg = CMSG_FIRSTHDR(msg);
+
+ /*
+ * Assign an (opaque) qpolicy priority value to skb->priority.
+ *
+ * We are overloading this skb field for use with the qpolicy subsystem.
+ * The skb->priority is normally used for the SO_PRIORITY option, which
+ * is initialised from sk_priority. Since the assignment of sk_priority
+ * to skb->priority happens later (on layer 3), we overload this field
+ * for use with queueing priorities as long as the skb is on layer 4.
+ * The default priority value (if nothing is set) is 0.
+ */
+ skb->priority = 0;
+
+ for (; cmsg != NULL; cmsg = CMSG_NXTHDR(msg, cmsg)) {
+
+ if (!CMSG_OK(msg, cmsg))
+ return -EINVAL;
+
+ if (cmsg->cmsg_level != SOL_DCCP)
+ continue; /* control messages of other levels are ignored */
+
+ if (cmsg->cmsg_type <= DCCP_SCM_QPOLICY_MAX &&
+ !dccp_qpolicy_param_ok(skb->sk, cmsg->cmsg_type))
+ return -EINVAL; /* type lies in the qpolicy range but dccp_qpolicy_param_ok() rejects it for this socket */
+
+ switch (cmsg->cmsg_type) {
+ case DCCP_SCM_PRIORITY:
+ if (cmsg->cmsg_len != CMSG_LEN(sizeof(__u32)))
+ return -EINVAL; /* payload must be exactly one __u32 */
+ skb->priority = *(__u32 *)CMSG_DATA(cmsg);
+ break;
+ default:
+ return -EINVAL; /* unknown SOL_DCCP message types are rejected */
+ }
+ }
+ return 0;
+}
+
int dccp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
size_t len)
{
@@ -384,6 +756,12 @@ int dccp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
return -EMSGSIZE;
lock_sock(sk);
+
+ if (dccp_qpolicy_full(sk)) {
+ rc = -EAGAIN;
+ goto out_release;
+ }
+
timeo = sock_sndtimeo(sk, noblock);
/*
@@ -391,7 +769,7 @@ int dccp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
* so that the trick in dccp_rcv_request_sent_state_process.
*/
/* Wait for a connection to finish. */
- if ((1 << sk->sk_state) & ~(DCCPF_OPEN | DCCPF_PARTOPEN | DCCPF_CLOSING))
+ if ((1 << sk->sk_state) & ~(DCCPF_OPEN | DCCPF_PARTOPEN))
if ((rc = sk_stream_wait_connect(sk, &timeo)) != 0)
goto out_release;
@@ -407,17 +785,18 @@ int dccp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
if (rc != 0)
goto out_discard;
- rc = dccp_write_xmit(sk, skb, &timeo);
+ rc = dccp_msghdr_parse(msg, skb);
+ if (rc != 0)
+ goto out_discard;
+
+ dccp_qpolicy_push(sk, skb);
/*
- * XXX we don't use sk_write_queue, so just discard the packet.
- * Current plan however is to _use_ sk_write_queue with
- * an algorith similar to tcp_sendmsg, where the main difference
- * is that in DCCP we have to respect packet boundaries, so
- * no coalescing of skbs.
- *
- * This bug was _quickly_ found & fixed by just looking at an OSTRA
- * generated callgraph 8) -acme
+ * The xmit_timer is set if the TX CCID is rate-based and will expire
+ * when congestion control permits to release further packets into the
+ * network. Window-based CCIDs do not use this timer.
*/
+ if (!timer_pending(&dp->dccps_xmit_timer))
+ dccp_write_xmit(sk);
out_release:
release_sock(sk);
return rc ? : len;
@@ -451,19 +830,26 @@ int dccp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
dh = dccp_hdr(skb);
- if (dh->dccph_type == DCCP_PKT_DATA ||
- dh->dccph_type == DCCP_PKT_DATAACK)
+ switch (dh->dccph_type) {
+ case DCCP_PKT_DATA:
+ case DCCP_PKT_DATAACK:
goto found_ok_skb;
- if (dh->dccph_type == DCCP_PKT_RESET ||
- dh->dccph_type == DCCP_PKT_CLOSE) {
- dccp_pr_debug("found fin ok!\n");
+ case DCCP_PKT_CLOSE:
+ case DCCP_PKT_CLOSEREQ:
+ if (!(flags & MSG_PEEK))
+ dccp_finish_passive_close(sk);
+ /* fall through */
+ case DCCP_PKT_RESET:
+ dccp_pr_debug("found fin (%s) ok!\n",
+ dccp_packet_name(dh->dccph_type));
len = 0;
goto found_fin_ok;
+ default:
+ dccp_pr_debug("packet_type=%s\n",
+ dccp_packet_name(dh->dccph_type));
+ sk_eat_skb(sk, skb, false);
}
- dccp_pr_debug("packet_type=%s\n",
- dccp_packet_name(dh->dccph_type));
- sk_eat_skb(sk, skb);
verify_sock_status:
if (sock_flag(sk, SOCK_DONE)) {
len = 0;
@@ -515,9 +901,11 @@ verify_sock_status:
len = -EFAULT;
break;
}
+ if (flags & MSG_TRUNC)
+ len = skb->len;
found_fin_ok:
if (!(flags & MSG_PEEK))
- sk_eat_skb(sk, skb);
+ sk_eat_skb(sk, skb, false);
break;
} while (1);
out:
@@ -551,7 +939,7 @@ int inet_dccp_listen(struct socket *sock, int backlog)
* FIXME: here it probably should be sk->sk_prot->listen_start
* see tcp_listen_start
*/
- err = dccp_listen_start(sk);
+ err = dccp_listen_start(sk, backlog);
if (err)
goto out;
}
@@ -565,33 +953,39 @@ out:
EXPORT_SYMBOL_GPL(inet_dccp_listen);
-static const unsigned char dccp_new_state[] = {
- /* current state: new state: action: */
- [0] = DCCP_CLOSED,
- [DCCP_OPEN] = DCCP_CLOSING | DCCP_ACTION_FIN,
- [DCCP_REQUESTING] = DCCP_CLOSED,
- [DCCP_PARTOPEN] = DCCP_CLOSING | DCCP_ACTION_FIN,
- [DCCP_LISTEN] = DCCP_CLOSED,
- [DCCP_RESPOND] = DCCP_CLOSED,
- [DCCP_CLOSING] = DCCP_CLOSED,
- [DCCP_TIME_WAIT] = DCCP_CLOSED,
- [DCCP_CLOSED] = DCCP_CLOSED,
-};
-
-static int dccp_close_state(struct sock *sk)
+static void dccp_terminate_connection(struct sock *sk) /* start connection termination according to the current sk_state */
{
- const int next = dccp_new_state[sk->sk_state];
- const int ns = next & DCCP_STATE_MASK;
+ u8 next_state = DCCP_CLOSED; /* target used by the default branch unless OPEN/PARTOPEN overrides it */
- if (ns != sk->sk_state)
- dccp_set_state(sk, ns);
+ switch (sk->sk_state) {
+ case DCCP_PASSIVE_CLOSE:
+ case DCCP_PASSIVE_CLOSEREQ:
+ dccp_finish_passive_close(sk); /* peer-initiated close: that helper performs the state change itself */
+ break;
+ case DCCP_PARTOPEN:
+ dccp_pr_debug("Stop PARTOPEN timer (%p)\n", sk);
+ inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK);
+ /* fall through */
+ case DCCP_OPEN:
+ dccp_send_close(sk, 1);
- return next & DCCP_ACTION_FIN;
+ if (dccp_sk(sk)->dccps_role == DCCP_ROLE_SERVER &&
+ !dccp_sk(sk)->dccps_server_timewait)
+ next_state = DCCP_ACTIVE_CLOSEREQ; /* server that has not opted into holding timewait state */
+ else
+ next_state = DCCP_CLOSING;
+ /* fall through */
+ default:
+ dccp_set_state(sk, next_state);
+ }
}
void dccp_close(struct sock *sk, long timeout)
{
+ struct dccp_sock *dp = dccp_sk(sk);
struct sk_buff *skb;
+ u32 data_was_unread = 0;
+ int state;
lock_sock(sk);
@@ -606,26 +1000,50 @@ void dccp_close(struct sock *sk, long timeout)
goto adjudge_to_death;
}
+ sk_stop_timer(sk, &dp->dccps_xmit_timer);
+
/*
* We need to flush the recv. buffs. We do this only on the
* descriptor close, not protocol-sourced closes, because the
*reader process may not have drained the data yet!
*/
- /* FIXME: check for unread data */
while ((skb = __skb_dequeue(&sk->sk_receive_queue)) != NULL) {
+ data_was_unread += skb->len;
__kfree_skb(skb);
}
- if (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime) {
+ if (data_was_unread) {
+ /* Unread data was tossed, send an appropriate Reset Code */
+ DCCP_WARN("ABORT with %u bytes unread\n", data_was_unread);
+ dccp_send_reset(sk, DCCP_RESET_CODE_ABORTED);
+ dccp_set_state(sk, DCCP_CLOSED);
+ } else if (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime) {
/* Check zero linger _after_ checking for unread data. */
sk->sk_prot->disconnect(sk, 0);
- } else if (dccp_close_state(sk)) {
- dccp_send_close(sk, 1);
+ } else if (sk->sk_state != DCCP_CLOSED) {
+ /*
+ * Normal connection termination. May need to wait if there are
+ * still packets in the TX queue that are delayed by the CCID.
+ */
+ dccp_flush_write_queue(sk, &timeout);
+ dccp_terminate_connection(sk);
}
+ /*
+ * Flush write queue. This may be necessary in several cases:
+ * - we have been closed by the peer but still have application data;
+ * - abortive termination (unread data or zero linger time),
+ * - normal termination but queue could not be flushed within time limit
+ */
+ __skb_queue_purge(&sk->sk_write_queue);
+
sk_stream_wait_close(sk, timeout);
adjudge_to_death:
+ state = sk->sk_state;
+ sock_hold(sk);
+ sock_orphan(sk);
+
/*
* It is the last release_sock in its life. It will remove backlog.
*/
@@ -636,35 +1054,20 @@ adjudge_to_death:
*/
local_bh_disable();
bh_lock_sock(sk);
- BUG_TRAP(!sock_owned_by_user(sk));
+ WARN_ON(sock_owned_by_user(sk));
- sock_hold(sk);
- sock_orphan(sk);
+ percpu_counter_inc(sk->sk_prot->orphan_count);
- /*
- * The last release_sock may have processed the CLOSE or RESET
- * packet moving sock to CLOSED state, if not we have to fire
- * the CLOSE/CLOSEREQ retransmission timer, see "8.3. Termination"
- * in draft-ietf-dccp-spec-11. -acme
- */
- if (sk->sk_state == DCCP_CLOSING) {
- /* FIXME: should start at 2 * RTT */
- /* Timer for repeating the CLOSE/CLOSEREQ until an answer. */
- inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
- inet_csk(sk)->icsk_rto,
- DCCP_RTO_MAX);
-#if 0
- /* Yeah, we should use sk->sk_prot->orphan_count, etc */
- dccp_set_state(sk, DCCP_CLOSED);
-#endif
- }
+ /* Have we already been destroyed by a softirq or backlog? */
+ if (state != DCCP_CLOSED && sk->sk_state == DCCP_CLOSED)
+ goto out;
- atomic_inc(sk->sk_prot->orphan_count);
if (sk->sk_state == DCCP_CLOSED)
inet_csk_destroy_sock(sk);
/* Otherwise, socket is reprieved until protocol close. */
+out:
bh_unlock_sock(sk);
local_bh_enable();
sock_put(sk);
@@ -674,108 +1077,22 @@ EXPORT_SYMBOL_GPL(dccp_close);
void dccp_shutdown(struct sock *sk, int how)
{
- dccp_pr_debug("entry\n");
+ dccp_pr_debug("called shutdown(%x)\n", how); /* only logs the request; no socket state is changed in this function */
}
EXPORT_SYMBOL_GPL(dccp_shutdown);
-static const struct proto_ops inet_dccp_ops = {
- .family = PF_INET,
- .owner = THIS_MODULE,
- .release = inet_release,
- .bind = inet_bind,
- .connect = inet_stream_connect,
- .socketpair = sock_no_socketpair,
- .accept = inet_accept,
- .getname = inet_getname,
- /* FIXME: work on tcp_poll to rename it to inet_csk_poll */
- .poll = dccp_poll,
- .ioctl = inet_ioctl,
- /* FIXME: work on inet_listen to rename it to sock_common_listen */
- .listen = inet_dccp_listen,
- .shutdown = inet_shutdown,
- .setsockopt = sock_common_setsockopt,
- .getsockopt = sock_common_getsockopt,
- .sendmsg = inet_sendmsg,
- .recvmsg = sock_common_recvmsg,
- .mmap = sock_no_mmap,
- .sendpage = sock_no_sendpage,
-};
-
-extern struct net_proto_family inet_family_ops;
-
-static struct inet_protosw dccp_v4_protosw = {
- .type = SOCK_DCCP,
- .protocol = IPPROTO_DCCP,
- .prot = &dccp_prot,
- .ops = &inet_dccp_ops,
- .capability = -1,
- .no_check = 0,
- .flags = INET_PROTOSW_ICSK,
-};
-
-/*
- * This is the global socket data structure used for responding to
- * the Out-of-the-blue (OOTB) packets. A control sock will be created
- * for this socket at the initialization time.
- */
-struct socket *dccp_ctl_socket;
-
-static char dccp_ctl_socket_err_msg[] __initdata =
- KERN_ERR "DCCP: Failed to create the control socket.\n";
-
-static int __init dccp_ctl_sock_init(void)
-{
- int rc = sock_create_kern(PF_INET, SOCK_DCCP, IPPROTO_DCCP,
- &dccp_ctl_socket);
- if (rc < 0)
- printk(dccp_ctl_socket_err_msg);
- else {
- dccp_ctl_socket->sk->sk_allocation = GFP_ATOMIC;
- inet_sk(dccp_ctl_socket->sk)->uc_ttl = -1;
-
- /* Unhash it so that IP input processing does not even
- * see it, we do not wish this socket to see incoming
- * packets.
- */
- dccp_ctl_socket->sk->sk_prot->unhash(dccp_ctl_socket->sk);
- }
-
- return rc;
-}
-
-#ifdef CONFIG_IP_DCCP_UNLOAD_HACK
-void dccp_ctl_sock_exit(void)
+static inline int dccp_mib_init(void) /* allocate the per-CPU DCCP MIB storage; returns 0 or -ENOMEM */
{
- if (dccp_ctl_socket != NULL) {
- sock_release(dccp_ctl_socket);
- dccp_ctl_socket = NULL;
- }
+ dccp_statistics = alloc_percpu(struct dccp_mib);
+ if (!dccp_statistics)
+ return -ENOMEM;
+ return 0;
}
-EXPORT_SYMBOL_GPL(dccp_ctl_sock_exit);
-#endif
-
-static int __init init_dccp_v4_mibs(void)
+static inline void dccp_mib_exit(void) /* counterpart of dccp_mib_init(): free the per-CPU MIB storage */
{
- int rc = -ENOMEM;
-
- dccp_statistics[0] = alloc_percpu(struct dccp_mib);
- if (dccp_statistics[0] == NULL)
- goto out;
-
- dccp_statistics[1] = alloc_percpu(struct dccp_mib);
- if (dccp_statistics[1] == NULL)
- goto out_free_one;
-
- rc = 0;
-out:
- return rc;
-out_free_one:
- free_percpu(dccp_statistics[0]);
- dccp_statistics[0] = NULL;
- goto out;
-
+ free_percpu(dccp_statistics);
}
static int thash_entries;
@@ -783,8 +1100,8 @@ module_param(thash_entries, int, 0444);
MODULE_PARM_DESC(thash_entries, "Number of ehash buckets");
#ifdef CONFIG_IP_DCCP_DEBUG
-int dccp_debug;
-module_param(dccp_debug, int, 0444);
+bool dccp_debug;
+module_param(dccp_debug, bool, 0644);
MODULE_PARM_DESC(dccp_debug, "Enable debug messages");
EXPORT_SYMBOL_GPL(dccp_debug);
@@ -794,17 +1111,21 @@ static int __init dccp_init(void)
{
unsigned long goal;
int ehash_order, bhash_order, i;
- int rc = proto_register(&dccp_prot, 1);
+ int rc;
+ BUILD_BUG_ON(sizeof(struct dccp_skb_cb) >
+ FIELD_SIZEOF(struct sk_buff, cb));
+ rc = percpu_counter_init(&dccp_orphan_count, 0);
if (rc)
- goto out;
-
+ goto out_fail;
+ rc = -ENOBUFS;
+ inet_hashinfo_init(&dccp_hashinfo);
dccp_hashinfo.bind_bucket_cachep =
kmem_cache_create("dccp_bind_bucket",
sizeof(struct inet_bind_bucket), 0,
- SLAB_HWCACHE_ALIGN, NULL, NULL);
+ SLAB_HWCACHE_ALIGN, NULL);
if (!dccp_hashinfo.bind_bucket_cachep)
- goto out_proto_unregister;
+ goto out_free_percpu;
/*
* Size and allocate the main established and bind bucket
@@ -812,10 +1133,10 @@ static int __init dccp_init(void)
*
* The methodology is similar to that of the buffer cache.
*/
- if (num_physpages >= (128 * 1024))
- goal = num_physpages >> (21 - PAGE_SHIFT);
+ if (totalram_pages >= (128 * 1024))
+ goal = totalram_pages >> (21 - PAGE_SHIFT);
else
- goal = num_physpages >> (23 - PAGE_SHIFT);
+ goal = totalram_pages >> (23 - PAGE_SHIFT);
if (thash_entries)
goal = (thash_entries *
@@ -823,26 +1144,26 @@ static int __init dccp_init(void)
for (ehash_order = 0; (1UL << ehash_order) < goal; ehash_order++)
;
do {
- dccp_hashinfo.ehash_size = (1UL << ehash_order) * PAGE_SIZE /
+ unsigned long hash_size = (1UL << ehash_order) * PAGE_SIZE /
sizeof(struct inet_ehash_bucket);
- dccp_hashinfo.ehash_size >>= 1;
- while (dccp_hashinfo.ehash_size &
- (dccp_hashinfo.ehash_size - 1))
- dccp_hashinfo.ehash_size--;
+
+ while (hash_size & (hash_size - 1))
+ hash_size--;
+ dccp_hashinfo.ehash_mask = hash_size - 1;
dccp_hashinfo.ehash = (struct inet_ehash_bucket *)
- __get_free_pages(GFP_ATOMIC, ehash_order);
+ __get_free_pages(GFP_ATOMIC|__GFP_NOWARN, ehash_order);
} while (!dccp_hashinfo.ehash && --ehash_order > 0);
if (!dccp_hashinfo.ehash) {
- printk(KERN_CRIT "Failed to allocate DCCP "
- "established hash table\n");
+ DCCP_CRIT("Failed to allocate DCCP established hash table");
goto out_free_bind_bucket_cachep;
}
- for (i = 0; i < (dccp_hashinfo.ehash_size << 1); i++) {
- rwlock_init(&dccp_hashinfo.ehash[i].lock);
- INIT_HLIST_HEAD(&dccp_hashinfo.ehash[i].chain);
- }
+ for (i = 0; i <= dccp_hashinfo.ehash_mask; i++)
+ INIT_HLIST_NULLS_HEAD(&dccp_hashinfo.ehash[i].chain, i);
+
+ if (inet_ehash_locks_alloc(&dccp_hashinfo))
+ goto out_free_dccp_ehash;
bhash_order = ehash_order;
@@ -853,12 +1174,12 @@ static int __init dccp_init(void)
bhash_order > 0)
continue;
dccp_hashinfo.bhash = (struct inet_bind_hashbucket *)
- __get_free_pages(GFP_ATOMIC, bhash_order);
+ __get_free_pages(GFP_ATOMIC|__GFP_NOWARN, bhash_order);
} while (!dccp_hashinfo.bhash && --bhash_order >= 0);
if (!dccp_hashinfo.bhash) {
- printk(KERN_CRIT "Failed to allocate DCCP bind hash table\n");
- goto out_free_dccp_ehash;
+ DCCP_CRIT("Failed to allocate DCCP bind hash table");
+ goto out_free_dccp_locks;
}
for (i = 0; i < dccp_hashinfo.bhash_size; i++) {
@@ -866,73 +1187,69 @@ static int __init dccp_init(void)
INIT_HLIST_HEAD(&dccp_hashinfo.bhash[i].chain);
}
- if (init_dccp_v4_mibs())
+ rc = dccp_mib_init();
+ if (rc)
goto out_free_dccp_bhash;
- rc = -EAGAIN;
- if (inet_add_protocol(&dccp_protocol, IPPROTO_DCCP))
- goto out_free_dccp_v4_mibs;
+ rc = dccp_ackvec_init();
+ if (rc)
+ goto out_free_dccp_mib;
- inet_register_protosw(&dccp_v4_protosw);
+ rc = dccp_sysctl_init();
+ if (rc)
+ goto out_ackvec_exit;
- rc = dccp_ctl_sock_init();
+ rc = ccid_initialize_builtins();
if (rc)
- goto out_unregister_protosw;
-out:
- return rc;
-out_unregister_protosw:
- inet_unregister_protosw(&dccp_v4_protosw);
- inet_del_protocol(&dccp_protocol, IPPROTO_DCCP);
-out_free_dccp_v4_mibs:
- free_percpu(dccp_statistics[0]);
- free_percpu(dccp_statistics[1]);
- dccp_statistics[0] = dccp_statistics[1] = NULL;
+ goto out_sysctl_exit;
+
+ dccp_timestamping_init();
+
+ return 0;
+
+out_sysctl_exit:
+ dccp_sysctl_exit();
+out_ackvec_exit:
+ dccp_ackvec_exit();
+out_free_dccp_mib:
+ dccp_mib_exit();
out_free_dccp_bhash:
free_pages((unsigned long)dccp_hashinfo.bhash, bhash_order);
- dccp_hashinfo.bhash = NULL;
+out_free_dccp_locks:
+ inet_ehash_locks_free(&dccp_hashinfo);
out_free_dccp_ehash:
free_pages((unsigned long)dccp_hashinfo.ehash, ehash_order);
- dccp_hashinfo.ehash = NULL;
out_free_bind_bucket_cachep:
kmem_cache_destroy(dccp_hashinfo.bind_bucket_cachep);
+out_free_percpu:
+ percpu_counter_destroy(&dccp_orphan_count);
+out_fail:
+ dccp_hashinfo.bhash = NULL;
+ dccp_hashinfo.ehash = NULL;
dccp_hashinfo.bind_bucket_cachep = NULL;
-out_proto_unregister:
- proto_unregister(&dccp_prot);
- goto out;
+ return rc;
}
-static const char dccp_del_proto_err_msg[] __exitdata =
- KERN_ERR "can't remove dccp net_protocol\n";
-
static void __exit dccp_fini(void)
{
- inet_unregister_protosw(&dccp_v4_protosw);
-
- if (inet_del_protocol(&dccp_protocol, IPPROTO_DCCP) < 0)
- printk(dccp_del_proto_err_msg);
-
- free_percpu(dccp_statistics[0]);
- free_percpu(dccp_statistics[1]);
+ ccid_cleanup_builtins();
+ dccp_mib_exit();
free_pages((unsigned long)dccp_hashinfo.bhash,
get_order(dccp_hashinfo.bhash_size *
sizeof(struct inet_bind_hashbucket)));
free_pages((unsigned long)dccp_hashinfo.ehash,
- get_order(dccp_hashinfo.ehash_size *
+ get_order((dccp_hashinfo.ehash_mask + 1) *
sizeof(struct inet_ehash_bucket)));
+ inet_ehash_locks_free(&dccp_hashinfo);
kmem_cache_destroy(dccp_hashinfo.bind_bucket_cachep);
- proto_unregister(&dccp_prot);
+ dccp_ackvec_exit();
+ dccp_sysctl_exit();
+ percpu_counter_destroy(&dccp_orphan_count);
}
module_init(dccp_init);
module_exit(dccp_fini);
-/*
- * __stringify doesn't likes enums, so use SOCK_DCCP (6) and IPPROTO_DCCP (33)
- * values directly, Also cover the case where the protocol is not specified,
- * i.e. net-pf-PF_INET-proto-0-type-SOCK_DCCP
- */
-MODULE_ALIAS("net-pf-" __stringify(PF_INET) "-proto-33-type-6");
-MODULE_ALIAS("net-pf-" __stringify(PF_INET) "-proto-0-type-6");
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Arnaldo Carvalho de Melo <acme@conectiva.com.br>");
MODULE_DESCRIPTION("DCCP - Datagram Congestion Controlled Protocol");
diff --git a/net/dccp/qpolicy.c b/net/dccp/qpolicy.c
new file mode 100644
index 00000000000..63c30bfa470
--- /dev/null
+++ b/net/dccp/qpolicy.c
@@ -0,0 +1,137 @@
+/*
+ * net/dccp/qpolicy.c
+ *
+ * Policy-based packet dequeueing interface for DCCP.
+ *
+ * Copyright (c) 2008 Tomasz Grobelny <tomasz@grobelny.oswiecenia.net>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License v2
+ * as published by the Free Software Foundation.
+ */
+#include "dccp.h"
+
+/*
+ * Simple Dequeueing Policy:
+ * If tx_qlen is non-zero, admit at most tx_qlen packets; 0 disables the limit.
+ */
+static void qpolicy_simple_push(struct sock *sk, struct sk_buff *skb)
+{
+ skb_queue_tail(&sk->sk_write_queue, skb);
+}
+
+static bool qpolicy_simple_full(struct sock *sk)
+{
+ return dccp_sk(sk)->dccps_tx_qlen &&
+ sk->sk_write_queue.qlen >= dccp_sk(sk)->dccps_tx_qlen;
+}
+
+static struct sk_buff *qpolicy_simple_top(struct sock *sk)
+{
+ return skb_peek(&sk->sk_write_queue);
+}
+
+/*
+ * Priority-based Dequeueing Policy:
+ * If tx_qlen is different from 0 and the queue has reached its upper bound
+ * of tx_qlen elements, replace older packets lowest-priority-first.
+ */
+static struct sk_buff *qpolicy_prio_best_skb(struct sock *sk)
+{
+ struct sk_buff *skb, *best = NULL;
+
+ skb_queue_walk(&sk->sk_write_queue, skb)
+ if (best == NULL || skb->priority > best->priority)
+ best = skb;
+ return best;
+}
+
+static struct sk_buff *qpolicy_prio_worst_skb(struct sock *sk)
+{
+ struct sk_buff *skb, *worst = NULL;
+
+ skb_queue_walk(&sk->sk_write_queue, skb)
+ if (worst == NULL || skb->priority < worst->priority)
+ worst = skb;
+ return worst;
+}
+
+static bool qpolicy_prio_full(struct sock *sk)
+{
+ if (qpolicy_simple_full(sk))
+ dccp_qpolicy_drop(sk, qpolicy_prio_worst_skb(sk));
+ return false;
+}
+
+/**
+ * struct dccp_qpolicy_operations - TX Packet Dequeueing Interface
+ * @push: add a new @skb to the write queue
+ * @full: indicates that no more packets will be admitted
+ * @top: peeks at whatever the queueing policy defines as its `top'
+ * @params: bitmask of the parameters this policy accepts, as tested by
+ * dccp_qpolicy_param_ok()
+ */
+static struct dccp_qpolicy_operations {
+ void (*push) (struct sock *sk, struct sk_buff *skb);
+ bool (*full) (struct sock *sk);
+ struct sk_buff* (*top) (struct sock *sk);
+ __be32 params;
+
+} qpol_table[DCCPQ_POLICY_MAX] = {
+ [DCCPQ_POLICY_SIMPLE] = {
+ .push = qpolicy_simple_push,
+ .full = qpolicy_simple_full,
+ .top = qpolicy_simple_top,
+ .params = 0,
+ },
+ [DCCPQ_POLICY_PRIO] = {
+ .push = qpolicy_simple_push,
+ .full = qpolicy_prio_full,
+ .top = qpolicy_prio_best_skb,
+ .params = DCCP_SCM_PRIORITY,
+ },
+};
+
+/*
+ * Externally visible interface
+ */
+void dccp_qpolicy_push(struct sock *sk, struct sk_buff *skb)
+{
+ qpol_table[dccp_sk(sk)->dccps_qpolicy].push(sk, skb);
+}
+
+bool dccp_qpolicy_full(struct sock *sk)
+{
+ return qpol_table[dccp_sk(sk)->dccps_qpolicy].full(sk);
+}
+
+void dccp_qpolicy_drop(struct sock *sk, struct sk_buff *skb)
+{
+ if (skb != NULL) {
+ skb_unlink(skb, &sk->sk_write_queue);
+ kfree_skb(skb);
+ }
+}
+
+struct sk_buff *dccp_qpolicy_top(struct sock *sk)
+{
+ return qpol_table[dccp_sk(sk)->dccps_qpolicy].top(sk);
+}
+
+struct sk_buff *dccp_qpolicy_pop(struct sock *sk)
+{
+ struct sk_buff *skb = dccp_qpolicy_top(sk);
+
+ if (skb != NULL) {
+ /* Clear any skb fields that we used internally */
+ skb->priority = 0;
+ skb_unlink(skb, &sk->sk_write_queue);
+ }
+ return skb;
+}
+
+bool dccp_qpolicy_param_ok(struct sock *sk, __be32 param)
+{
+ /* check if exactly one bit is set */
+ if (!param || (param & (param - 1)))
+ return false;
+ return (qpol_table[dccp_sk(sk)->dccps_qpolicy].params & param) == param;
+}
diff --git a/net/dccp/sysctl.c b/net/dccp/sysctl.c
new file mode 100644
index 00000000000..53731e45403
--- /dev/null
+++ b/net/dccp/sysctl.c
@@ -0,0 +1,118 @@
+/*
+ * net/dccp/sysctl.c
+ *
+ * An implementation of the DCCP protocol
+ * Arnaldo Carvalho de Melo <acme@mandriva.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License v2
+ * as published by the Free Software Foundation.
+ */
+
+#include <linux/mm.h>
+#include <linux/sysctl.h>
+#include "dccp.h"
+#include "feat.h"
+
+#ifndef CONFIG_SYSCTL
+#error This file should not be compiled without CONFIG_SYSCTL defined
+#endif
+
+/* Boundary values */
+static int zero = 0,
+ one = 1,
+ u8_max = 0xFF;
+static unsigned long seqw_min = DCCPF_SEQ_WMIN,
+ seqw_max = 0xFFFFFFFF; /* maximum on 32 bit */
+
+static struct ctl_table dccp_default_table[] = {
+ {
+ .procname = "seq_window",
+ .data = &sysctl_dccp_sequence_window,
+ .maxlen = sizeof(sysctl_dccp_sequence_window),
+ .mode = 0644,
+ .proc_handler = proc_doulongvec_minmax,
+ .extra1 = &seqw_min, /* RFC 4340, 7.5.2 */
+ .extra2 = &seqw_max,
+ },
+ {
+ .procname = "rx_ccid",
+ .data = &sysctl_dccp_rx_ccid,
+ .maxlen = sizeof(sysctl_dccp_rx_ccid),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &zero,
+ .extra2 = &u8_max, /* RFC 4340, 10. */
+ },
+ {
+ .procname = "tx_ccid",
+ .data = &sysctl_dccp_tx_ccid,
+ .maxlen = sizeof(sysctl_dccp_tx_ccid),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &zero,
+ .extra2 = &u8_max, /* RFC 4340, 10. */
+ },
+ {
+ .procname = "request_retries",
+ .data = &sysctl_dccp_request_retries,
+ .maxlen = sizeof(sysctl_dccp_request_retries),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &one,
+ .extra2 = &u8_max,
+ },
+ {
+ .procname = "retries1",
+ .data = &sysctl_dccp_retries1,
+ .maxlen = sizeof(sysctl_dccp_retries1),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &zero,
+ .extra2 = &u8_max,
+ },
+ {
+ .procname = "retries2",
+ .data = &sysctl_dccp_retries2,
+ .maxlen = sizeof(sysctl_dccp_retries2),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &zero,
+ .extra2 = &u8_max,
+ },
+ {
+ .procname = "tx_qlen",
+ .data = &sysctl_dccp_tx_qlen,
+ .maxlen = sizeof(sysctl_dccp_tx_qlen),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &zero,
+ },
+ {
+ .procname = "sync_ratelimit",
+ .data = &sysctl_dccp_sync_ratelimit,
+ .maxlen = sizeof(sysctl_dccp_sync_ratelimit),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_ms_jiffies,
+ },
+
+ { }
+};
+
+static struct ctl_table_header *dccp_table_header;
+
+int __init dccp_sysctl_init(void)
+{
+ dccp_table_header = register_net_sysctl(&init_net, "net/dccp/default",
+ dccp_default_table);
+
+ return dccp_table_header != NULL ? 0 : -ENOMEM;
+}
+
+void dccp_sysctl_exit(void)
+{
+ if (dccp_table_header != NULL) {
+ unregister_net_sysctl_table(dccp_table_header);
+ dccp_table_header = NULL;
+ }
+}
diff --git a/net/dccp/timer.c b/net/dccp/timer.c
index aa34b576e22..1cd46a345cb 100644
--- a/net/dccp/timer.c
+++ b/net/dccp/timer.c
@@ -1,6 +1,6 @@
/*
* net/dccp/timer.c
- *
+ *
* An implementation of the DCCP protocol
* Arnaldo Carvalho de Melo <acme@conectiva.com.br>
*
@@ -10,28 +10,23 @@
* 2 of the License, or (at your option) any later version.
*/
-#include <linux/config.h>
#include <linux/dccp.h>
#include <linux/skbuff.h>
+#include <linux/export.h>
#include "dccp.h"
-static void dccp_write_timer(unsigned long data);
-static void dccp_keepalive_timer(unsigned long data);
-static void dccp_delack_timer(unsigned long data);
-
-void dccp_init_xmit_timers(struct sock *sk)
-{
- inet_csk_init_xmit_timers(sk, &dccp_write_timer, &dccp_delack_timer,
- &dccp_keepalive_timer);
-}
+/* sysctl variables governing numbers of retransmission attempts */
+int sysctl_dccp_request_retries __read_mostly = TCP_SYN_RETRIES;
+int sysctl_dccp_retries1 __read_mostly = TCP_RETR1;
+int sysctl_dccp_retries2 __read_mostly = TCP_RETR2;
static void dccp_write_err(struct sock *sk)
{
sk->sk_err = sk->sk_err_soft ? : ETIMEDOUT;
sk->sk_error_report(sk);
- dccp_v4_send_reset(sk, DCCP_RESET_CODE_ABORTED);
+ dccp_send_reset(sk, DCCP_RESET_CODE_ABORTED);
dccp_done(sk);
DCCP_INC_STATS_BH(DCCP_MIB_ABORTONTIMEOUT);
}
@@ -44,12 +39,11 @@ static int dccp_write_timeout(struct sock *sk)
if (sk->sk_state == DCCP_REQUESTING || sk->sk_state == DCCP_PARTOPEN) {
if (icsk->icsk_retransmits != 0)
- dst_negative_advice(&sk->sk_dst_cache);
- retry_until = icsk->icsk_syn_retries ? :
- /* FIXME! */ 3 /* FIXME! sysctl_tcp_syn_retries */;
+ dst_negative_advice(sk);
+ retry_until = icsk->icsk_syn_retries ?
+ : sysctl_dccp_request_retries;
} else {
- if (icsk->icsk_retransmits >=
- /* FIXME! sysctl_tcp_retries1 */ 5 /* FIXME! */) {
+ if (icsk->icsk_retransmits >= sysctl_dccp_retries1) {
/* NOTE. draft-ietf-tcpimpl-pmtud-01.txt requires pmtu
black hole detection. :-(
@@ -67,13 +61,13 @@ static int dccp_write_timeout(struct sock *sk)
be far nicer to have all of the black holes fixed rather than fixing
all of the TCP implementations."
- Golden words :-).
+ Golden words :-).
*/
- dst_negative_advice(&sk->sk_dst_cache);
+ dst_negative_advice(sk);
}
- retry_until = /* FIXME! */ 15 /* FIXME! sysctl_tcp_retries2 */;
+ retry_until = sysctl_dccp_retries2;
/*
* FIXME: see tcp_write_timout and tcp_out_of_resources
*/
@@ -87,53 +81,6 @@ static int dccp_write_timeout(struct sock *sk)
return 0;
}
-/* This is the same as tcp_delack_timer, sans prequeue & mem_reclaim stuff */
-static void dccp_delack_timer(unsigned long data)
-{
- struct sock *sk = (struct sock *)data;
- struct inet_connection_sock *icsk = inet_csk(sk);
-
- bh_lock_sock(sk);
- if (sock_owned_by_user(sk)) {
- /* Try again later. */
- icsk->icsk_ack.blocked = 1;
- NET_INC_STATS_BH(LINUX_MIB_DELAYEDACKLOCKED);
- sk_reset_timer(sk, &icsk->icsk_delack_timer,
- jiffies + TCP_DELACK_MIN);
- goto out;
- }
-
- if (sk->sk_state == DCCP_CLOSED ||
- !(icsk->icsk_ack.pending & ICSK_ACK_TIMER))
- goto out;
- if (time_after(icsk->icsk_ack.timeout, jiffies)) {
- sk_reset_timer(sk, &icsk->icsk_delack_timer,
- icsk->icsk_ack.timeout);
- goto out;
- }
-
- icsk->icsk_ack.pending &= ~ICSK_ACK_TIMER;
-
- if (inet_csk_ack_scheduled(sk)) {
- if (!icsk->icsk_ack.pingpong) {
- /* Delayed ACK missed: inflate ATO. */
- icsk->icsk_ack.ato = min(icsk->icsk_ack.ato << 1,
- icsk->icsk_rto);
- } else {
- /* Delayed ACK missed: leave pingpong mode and
- * deflate ATO.
- */
- icsk->icsk_ack.pingpong = 0;
- icsk->icsk_ack.ato = TCP_ATO_MIN;
- }
- dccp_send_ack(sk);
- NET_INC_STATS_BH(LINUX_MIB_DELAYEDACKS);
- }
-out:
- bh_unlock_sock(sk);
- sock_put(sk);
-}
-
/*
* The DCCP retransmit timer.
*/
@@ -142,19 +89,11 @@ static void dccp_retransmit_timer(struct sock *sk)
struct inet_connection_sock *icsk = inet_csk(sk);
/*
- * sk->sk_send_head has to have one skb with
- * DCCP_SKB_CB(skb)->dccpd_type set to one of the retransmittable DCCP
- * packet types (REQUEST, RESPONSE, the ACK in the 3way handshake
- * (PARTOPEN timer), etc).
- */
- BUG_TRAP(sk->sk_send_head != NULL);
-
- /*
* More than than 4MSL (8 minutes) has passed, a RESET(aborted) was
* sent, no need to retransmit, this sock is dead.
*/
if (dccp_write_timeout(sk))
- goto out;
+ return;
/*
* We want to know the number of packets retransmitted, not the
@@ -163,29 +102,27 @@ static void dccp_retransmit_timer(struct sock *sk)
if (icsk->icsk_retransmits == 0)
DCCP_INC_STATS_BH(DCCP_MIB_TIMEOUTS);
- if (dccp_retransmit_skb(sk, sk->sk_send_head) < 0) {
+ if (dccp_retransmit_skb(sk) != 0) {
/*
* Retransmission failed because of local congestion,
* do not backoff.
*/
- if (icsk->icsk_retransmits == 0)
+ if (--icsk->icsk_retransmits == 0)
icsk->icsk_retransmits = 1;
inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
min(icsk->icsk_rto,
TCP_RESOURCE_PROBE_INTERVAL),
DCCP_RTO_MAX);
- goto out;
+ return;
}
icsk->icsk_backoff++;
- icsk->icsk_retransmits++;
icsk->icsk_rto = min(icsk->icsk_rto << 1, DCCP_RTO_MAX);
inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, icsk->icsk_rto,
DCCP_RTO_MAX);
- if (icsk->icsk_retransmits > 3 /* FIXME: sysctl_dccp_retries1 */)
+ if (icsk->icsk_retransmits > sysctl_dccp_retries1)
__sk_dst_reset(sk);
-out:;
}
static void dccp_write_timer(unsigned long data)
@@ -240,7 +177,7 @@ static void dccp_keepalive_timer(unsigned long data)
/* Only process if socket is not in use. */
bh_lock_sock(sk);
if (sock_owned_by_user(sk)) {
- /* Try again later. */
+ /* Try again later. */
inet_csk_reset_keepalive_timer(sk, HZ / 20);
goto out;
}
@@ -253,3 +190,104 @@ out:
bh_unlock_sock(sk);
sock_put(sk);
}
+
+/* This is the same as tcp_delack_timer, sans prequeue & mem_reclaim stuff */
+static void dccp_delack_timer(unsigned long data)
+{
+ struct sock *sk = (struct sock *)data;
+ struct inet_connection_sock *icsk = inet_csk(sk);
+
+ bh_lock_sock(sk);
+ if (sock_owned_by_user(sk)) {
+ /* Try again later. */
+ icsk->icsk_ack.blocked = 1;
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_DELAYEDACKLOCKED);
+ sk_reset_timer(sk, &icsk->icsk_delack_timer,
+ jiffies + TCP_DELACK_MIN);
+ goto out;
+ }
+
+ if (sk->sk_state == DCCP_CLOSED ||
+ !(icsk->icsk_ack.pending & ICSK_ACK_TIMER))
+ goto out;
+ if (time_after(icsk->icsk_ack.timeout, jiffies)) {
+ sk_reset_timer(sk, &icsk->icsk_delack_timer,
+ icsk->icsk_ack.timeout);
+ goto out;
+ }
+
+ icsk->icsk_ack.pending &= ~ICSK_ACK_TIMER;
+
+ if (inet_csk_ack_scheduled(sk)) {
+ if (!icsk->icsk_ack.pingpong) {
+ /* Delayed ACK missed: inflate ATO. */
+ icsk->icsk_ack.ato = min(icsk->icsk_ack.ato << 1,
+ icsk->icsk_rto);
+ } else {
+ /* Delayed ACK missed: leave pingpong mode and
+ * deflate ATO.
+ */
+ icsk->icsk_ack.pingpong = 0;
+ icsk->icsk_ack.ato = TCP_ATO_MIN;
+ }
+ dccp_send_ack(sk);
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_DELAYEDACKS);
+ }
+out:
+ bh_unlock_sock(sk);
+ sock_put(sk);
+}
+
+/**
+ * dccp_write_xmitlet - Workhorse for CCID packet dequeueing interface
+ * @data: the socket to act on, passed as an unsigned long
+ *
+ * See the comments above %ccid_dequeueing_decision for supported modes.
+ */
+static void dccp_write_xmitlet(unsigned long data)
+{
+ struct sock *sk = (struct sock *)data;
+
+ bh_lock_sock(sk);
+ if (sock_owned_by_user(sk))
+ sk_reset_timer(sk, &dccp_sk(sk)->dccps_xmit_timer, jiffies + 1);
+ else
+ dccp_write_xmit(sk);
+ bh_unlock_sock(sk);
+}
+
+static void dccp_write_xmit_timer(unsigned long data)
+{
+ dccp_write_xmitlet(data);
+ sock_put((struct sock *)data);
+}
+
+void dccp_init_xmit_timers(struct sock *sk)
+{
+ struct dccp_sock *dp = dccp_sk(sk);
+
+ tasklet_init(&dp->dccps_xmitlet, dccp_write_xmitlet, (unsigned long)sk);
+ setup_timer(&dp->dccps_xmit_timer, dccp_write_xmit_timer,
+ (unsigned long)sk);
+ inet_csk_init_xmit_timers(sk, &dccp_write_timer, &dccp_delack_timer,
+ &dccp_keepalive_timer);
+}
+
+static ktime_t dccp_timestamp_seed;
+/**
+ * dccp_timestamp - 10s of microseconds time source
+ * Returns the number of 10s of microseconds since loading DCCP. This is the
+ * native DCCP time difference format (RFC 4340, sec. 13).
+ * Please note: the 32-bit value wraps around roughly every 11.9 hours.
+ */
+u32 dccp_timestamp(void)
+{
+ u64 delta = (u64)ktime_us_delta(ktime_get_real(), dccp_timestamp_seed);
+
+ do_div(delta, 10);
+ return delta;
+}
+EXPORT_SYMBOL_GPL(dccp_timestamp);
+
+void __init dccp_timestamping_init(void)
+{
+ dccp_timestamp_seed = ktime_get_real();
+}