diff options
author | David S. Miller <davem@davemloft.net> | 2008-09-08 17:28:59 -0700 |
---|---|---|
committer | David S. Miller <davem@davemloft.net> | 2008-09-08 17:28:59 -0700 |
commit | 0a68a20cc3eafa73bb54097c28b921147d7d3685 (patch) | |
tree | 8e5f315226b618cb8e050a0c7653c8ec134501e3 | |
parent | 17dce5dfe38ae2fb359b61e855f5d8a3a8b7892b (diff) | |
parent | a3cbdde8e9c38b66b4f13ac5d6ff1939ded0ff20 (diff) |
Merge branch 'dccp' of git://eden-feed.erg.abdn.ac.uk/dccp_exp
Conflicts:
net/dccp/input.c
net/dccp/options.c
36 files changed, 3971 insertions, 2884 deletions
diff --git a/Documentation/networking/dccp.txt b/Documentation/networking/dccp.txt index 39131a3c78f..fcfc1253442 100644 --- a/Documentation/networking/dccp.txt +++ b/Documentation/networking/dccp.txt @@ -45,6 +45,25 @@ http://linux-net.osdl.org/index.php/DCCP_Testing#Experimental_DCCP_source_tree Socket options ============== +DCCP_SOCKOPT_QPOLICY_ID sets the dequeuing policy for outgoing packets. It takes +a policy ID as argument and can only be set before the connection (i.e. changes +during an established connection are not supported). Currently, two policies are +defined: the "simple" policy (DCCPQ_POLICY_SIMPLE), which does nothing special, +and a priority-based variant (DCCPQ_POLICY_PRIO). The latter allows to pass an +u32 priority value as ancillary data to sendmsg(), where higher numbers indicate +a higher packet priority (similar to SO_PRIORITY). This ancillary data needs to +be formatted using a cmsg(3) message header filled in as follows: + cmsg->cmsg_level = SOL_DCCP; + cmsg->cmsg_type = DCCP_SCM_PRIORITY; + cmsg->cmsg_len = CMSG_LEN(sizeof(uint32_t)); /* or CMSG_LEN(4) */ + +DCCP_SOCKOPT_QPOLICY_TXQLEN sets the maximum length of the output queue. A zero +value is always interpreted as unbounded queue length. If different from zero, +the interpretation of this parameter depends on the current dequeuing policy +(see above): the "simple" policy will enforce a fixed queue size by returning +EAGAIN, whereas the "prio" policy enforces a fixed queue length by dropping the +lowest-priority packet first. The default value for this parameter is +initialised from /proc/sys/net/dccp/default/tx_qlen. DCCP_SOCKOPT_SERVICE sets the service. The specification mandates use of service codes (RFC 4340, sec. 8.1.2); if this socket option is not set, @@ -57,6 +76,24 @@ can be set before calling bind(). DCCP_SOCKOPT_GET_CUR_MPS is read-only and retrieves the current maximum packet size (application payload size) in bytes, see RFC 4340, section 14. +DCCP_SOCKOPT_AVAILABLE_CCIDS is also read-only and returns the list of CCIDs +supported by the endpoint (see include/linux/dccp.h for symbolic constants). +The caller needs to provide a sufficiently large (> 2) array of type uint8_t. + +DCCP_SOCKOPT_CCID is write-only and sets both the TX and RX CCIDs at the same +time, combining the operation of the next two socket options. This option is +preferrable over the latter two, since often applications will use the same +type of CCID for both directions; and mixed use of CCIDs is not currently well +understood. This socket option takes as argument at least one uint8_t value, or +an array of uint8_t values, which must match available CCIDS (see above). CCIDs +must be registered on the socket before calling connect() or listen(). + +DCCP_SOCKOPT_TX_CCID is read/write. It returns the current CCID (if set) or sets +the preference list for the TX CCID, using the same format as DCCP_SOCKOPT_CCID. +Please note that the getsockopt argument type here is `int', not uint8_t. + +DCCP_SOCKOPT_RX_CCID is analogous to DCCP_SOCKOPT_TX_CCID, but for the RX CCID. + DCCP_SOCKOPT_SERVER_TIMEWAIT enables the server (listening socket) to hold timewait state when closing the connection (RFC 4340, 8.3). The usual case is that the closing server sends a CloseReq, whereupon the client holds timewait @@ -115,23 +152,16 @@ retries2 importance for retransmitted acknowledgments and feature negotiation, data packets are never retransmitted. Analogue of tcp_retries2. -send_ndp = 1 - Whether or not to send NDP count options (sec. 7.7.2). - -send_ackvec = 1 - Whether or not to send Ack Vector options (sec. 11.5). - -ack_ratio = 2 - The default Ack Ratio (sec. 11.3) to use. - tx_ccid = 2 - Default CCID for the sender-receiver half-connection. + Default CCID for the sender-receiver half-connection. Depending on the + choice of CCID, the Send Ack Vector feature is enabled automatically. rx_ccid = 2 - Default CCID for the receiver-sender half-connection. + Default CCID for the receiver-sender half-connection; see tx_ccid. seq_window = 100 - The initial sequence window (sec. 7.5.2). + The initial sequence window (sec. 7.5.2) of the sender. This influences + the local ackno validity and the remote seqno validity windows (7.5.1). tx_qlen = 5 The size of the transmit buffer in packets. A value of 0 corresponds diff --git a/include/linux/dccp.h b/include/linux/dccp.h index 6080449fbec..010e2d87ed7 100644 --- a/include/linux/dccp.h +++ b/include/linux/dccp.h @@ -165,9 +165,13 @@ enum { DCCPO_TIMESTAMP_ECHO = 42, DCCPO_ELAPSED_TIME = 43, DCCPO_MAX = 45, - DCCPO_MIN_CCID_SPECIFIC = 128, - DCCPO_MAX_CCID_SPECIFIC = 255, + DCCPO_MIN_RX_CCID_SPECIFIC = 128, /* from sender to receiver */ + DCCPO_MAX_RX_CCID_SPECIFIC = 191, + DCCPO_MIN_TX_CCID_SPECIFIC = 192, /* from receiver to sender */ + DCCPO_MAX_TX_CCID_SPECIFIC = 255, }; +/* maximum size of a single TLV-encoded DCCP option (sans type/len bytes) */ +#define DCCP_SINGLE_OPT_MAXLEN 253 /* DCCP CCIDS */ enum { @@ -176,27 +180,36 @@ enum { }; /* DCCP features (RFC 4340 section 6.4) */ -enum { +enum dccp_feature_numbers { DCCPF_RESERVED = 0, DCCPF_CCID = 1, - DCCPF_SHORT_SEQNOS = 2, /* XXX: not yet implemented */ + DCCPF_SHORT_SEQNOS = 2, DCCPF_SEQUENCE_WINDOW = 3, - DCCPF_ECN_INCAPABLE = 4, /* XXX: not yet implemented */ + DCCPF_ECN_INCAPABLE = 4, DCCPF_ACK_RATIO = 5, DCCPF_SEND_ACK_VECTOR = 6, DCCPF_SEND_NDP_COUNT = 7, DCCPF_MIN_CSUM_COVER = 8, - DCCPF_DATA_CHECKSUM = 9, /* XXX: not yet implemented */ + DCCPF_DATA_CHECKSUM = 9, /* 10-127 reserved */ DCCPF_MIN_CCID_SPECIFIC = 128, + DCCPF_SEND_LEV_RATE = 192, /* RFC 4342, sec. 8.4 */ DCCPF_MAX_CCID_SPECIFIC = 255, }; -/* this structure is argument to DCCP_SOCKOPT_CHANGE_X */ -struct dccp_so_feat { - __u8 dccpsf_feat; - __u8 __user *dccpsf_val; - __u8 dccpsf_len; +/* DCCP socket control message types for cmsg */ +enum dccp_cmsg_type { + DCCP_SCM_PRIORITY = 1, + DCCP_SCM_QPOLICY_MAX = 0xFFFF, + /* ^-- Up to here reserved exclusively for qpolicy parameters */ + DCCP_SCM_MAX +}; + +/* DCCP priorities for outgoing/queued packets */ +enum dccp_packet_dequeueing_policy { + DCCPQ_POLICY_SIMPLE, + DCCPQ_POLICY_PRIO, + DCCPQ_POLICY_MAX }; /* DCCP socket options */ @@ -208,6 +221,12 @@ struct dccp_so_feat { #define DCCP_SOCKOPT_SERVER_TIMEWAIT 6 #define DCCP_SOCKOPT_SEND_CSCOV 10 #define DCCP_SOCKOPT_RECV_CSCOV 11 +#define DCCP_SOCKOPT_AVAILABLE_CCIDS 12 +#define DCCP_SOCKOPT_CCID 13 +#define DCCP_SOCKOPT_TX_CCID 14 +#define DCCP_SOCKOPT_RX_CCID 15 +#define DCCP_SOCKOPT_QPOLICY_ID 16 +#define DCCP_SOCKOPT_QPOLICY_TXQLEN 17 #define DCCP_SOCKOPT_CCID_RX_INFO 128 #define DCCP_SOCKOPT_CCID_TX_INFO 192 @@ -355,62 +374,13 @@ static inline unsigned int dccp_hdr_len(const struct sk_buff *skb) return __dccp_hdr_len(dccp_hdr(skb)); } - -/* initial values for each feature */ -#define DCCPF_INITIAL_SEQUENCE_WINDOW 100 -#define DCCPF_INITIAL_ACK_RATIO 2 -#define DCCPF_INITIAL_CCID DCCPC_CCID2 -#define DCCPF_INITIAL_SEND_ACK_VECTOR 1 -/* FIXME: for now we're default to 1 but it should really be 0 */ -#define DCCPF_INITIAL_SEND_NDP_COUNT 1 - -/** - * struct dccp_minisock - Minimal DCCP connection representation - * - * Will be used to pass the state from dccp_request_sock to dccp_sock. - * - * @dccpms_sequence_window - Sequence Window Feature (section 7.5.2) - * @dccpms_ccid - Congestion Control Id (CCID) (section 10) - * @dccpms_send_ack_vector - Send Ack Vector Feature (section 11.5) - * @dccpms_send_ndp_count - Send NDP Count Feature (7.7.2) - * @dccpms_ack_ratio - Ack Ratio Feature (section 11.3) - * @dccpms_pending - List of features being negotiated - * @dccpms_conf - - */ -struct dccp_minisock { - __u64 dccpms_sequence_window; - __u8 dccpms_rx_ccid; - __u8 dccpms_tx_ccid; - __u8 dccpms_send_ack_vector; - __u8 dccpms_send_ndp_count; - __u8 dccpms_ack_ratio; - struct list_head dccpms_pending; - struct list_head dccpms_conf; -}; - -struct dccp_opt_conf { - __u8 *dccpoc_val; - __u8 dccpoc_len; -}; - -struct dccp_opt_pend { - struct list_head dccpop_node; - __u8 dccpop_type; - __u8 dccpop_feat; - __u8 *dccpop_val; - __u8 dccpop_len; - int dccpop_conf; - struct dccp_opt_conf *dccpop_sc; -}; - -extern void dccp_minisock_init(struct dccp_minisock *dmsk); - /** * struct dccp_request_sock - represent DCCP-specific connection request * @dreq_inet_rsk: structure inherited from * @dreq_iss: initial sequence number sent on the Response (RFC 4340, 7.1) * @dreq_isr: initial sequence number received on the Request * @dreq_service: service code present on the Request (there is just one) + * @dreq_featneg: feature negotiation options for this connection * The following two fields are analogous to the ones in dccp_sock: * @dreq_timestamp_echo: last received timestamp to echo (13.1) * @dreq_timestamp_echo: the time of receiving the last @dreq_timestamp_echo @@ -420,6 +390,7 @@ struct dccp_request_sock { __u64 dreq_iss; __u64 dreq_isr; __be32 dreq_service; + struct list_head dreq_featneg; __u32 dreq_timestamp_echo; __u32 dreq_timestamp_time; }; @@ -491,21 +462,28 @@ struct dccp_ackvec; * @dccps_timestamp_time - time of receiving latest @dccps_timestamp_echo * @dccps_l_ack_ratio - feature-local Ack Ratio * @dccps_r_ack_ratio - feature-remote Ack Ratio + * @dccps_l_seq_win - local Sequence Window (influences ack number validity) + * @dccps_r_seq_win - remote Sequence Window (influences seq number validity) * @dccps_pcslen - sender partial checksum coverage (via sockopt) * @dccps_pcrlen - receiver partial checksum coverage (via sockopt) + * @dccps_send_ndp_count - local Send NDP Count feature (7.7.2) * @dccps_ndp_count - number of Non Data Packets since last data packet * @dccps_mss_cache - current value of MSS (path MTU minus header sizes) * @dccps_rate_last - timestamp for rate-limiting DCCP-Sync (RFC 4340, 7.5.4) - * @dccps_minisock - associated minisock (accessed via dccp_msk) + * @dccps_featneg - tracks feature-negotiation state (mostly during handshake) * @dccps_hc_rx_ackvec - rx half connection ack vector * @dccps_hc_rx_ccid - CCID used for the receiver (or receiving half-connection) * @dccps_hc_tx_ccid - CCID used for the sender (or sending half-connection) * @dccps_options_received - parsed set of retrieved options + * @dccps_qpolicy - TX dequeueing policy, one of %dccp_packet_dequeueing_policy + * @dccps_tx_qlen - maximum length of the TX queue * @dccps_role - role of this sock, one of %dccp_role * @dccps_hc_rx_insert_options - receiver wants to add options when acking * @dccps_hc_tx_insert_options - sender wants to add options when sending * @dccps_server_timewait - server holds timewait state on close (RFC 4340, 8.3) - * @dccps_xmit_timer - timer for when CCID is not ready to send + * @dccps_sync_scheduled - flag which signals "send out-of-band message soon" + * @dccps_xmitlet - tasklet scheduled by the TX CCID to dequeue data packets + * @dccps_xmit_timer - used by the TX CCID to delay sending (rate-based pacing) * @dccps_syn_rtt - RTT sample from Request/Response exchange (in usecs) */ struct dccp_sock { @@ -529,19 +507,26 @@ struct dccp_sock { __u32 dccps_timestamp_time; __u16 dccps_l_ack_ratio; __u16 dccps_r_ack_ratio; - __u16 dccps_pcslen; - __u16 dccps_pcrlen; + __u64 dccps_l_seq_win:48; + __u64 dccps_r_seq_win:48; + __u8 dccps_pcslen:4; + __u8 dccps_pcrlen:4; + __u8 dccps_send_ndp_count:1; __u64 dccps_ndp_count:48; unsigned long dccps_rate_last; - struct dccp_minisock dccps_minisock; + struct list_head dccps_featneg; struct dccp_ackvec *dccps_hc_rx_ackvec; struct ccid *dccps_hc_rx_ccid; struct ccid *dccps_hc_tx_ccid; struct dccp_options_received dccps_options_received; + __u8 dccps_qpolicy; + __u32 dccps_tx_qlen; enum dccp_role dccps_role:2; __u8 dccps_hc_rx_insert_options:1; __u8 dccps_hc_tx_insert_options:1; __u8 dccps_server_timewait:1; + __u8 dccps_sync_scheduled:1; + struct tasklet_struct dccps_xmitlet; struct timer_list dccps_xmit_timer; }; @@ -550,11 +535,6 @@ static inline struct dccp_sock *dccp_sk(const struct sock *sk) return (struct dccp_sock *)sk; } -static inline struct dccp_minisock *dccp_msk(const struct sock *sk) -{ - return (struct dccp_minisock *)&dccp_sk(sk)->dccps_minisock; -} - static inline const char *dccp_role(const struct sock *sk) { switch (dccp_sk(sk)->dccps_role) { diff --git a/include/net/tcp.h b/include/net/tcp.h index 8983386356a..6bc4b8148ca 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -782,6 +782,21 @@ static inline __u32 tcp_current_ssthresh(const struct sock *sk) /* Use define here intentionally to get WARN_ON location shown at the caller */ #define tcp_verify_left_out(tp) WARN_ON(tcp_left_out(tp) > tp->packets_out) +/* + * Convert RFC3390 larger initial windows into an equivalent number of packets. + * + * John Heffner states: + * + * The RFC specifies a window of no more than 4380 bytes + * unless 2*MSS > 4380. Reading the pseudocode in the RFC + * is a bit misleading because they use a clamp at 4380 bytes + * rather than a multiplier in the relevant range. + */ +static inline u32 rfc3390_bytes_to_packets(const u32 bytes) +{ + return bytes <= 1095 ? 4 : (bytes > 1460 ? 2 : 3); +} + extern void tcp_enter_cwr(struct sock *sk, const int set_ssthresh); extern __u32 tcp_init_cwnd(struct tcp_sock *tp, struct dst_entry *dst); diff --git a/net/dccp/Kconfig b/net/dccp/Kconfig index 7aa2a7acc7e..206c16ad9c3 100644 --- a/net/dccp/Kconfig +++ b/net/dccp/Kconfig @@ -25,9 +25,6 @@ config INET_DCCP_DIAG def_tristate y if (IP_DCCP = y && INET_DIAG = y) def_tristate m -config IP_DCCP_ACKVEC - bool - source "net/dccp/ccids/Kconfig" menu "DCCP Kernel Hacking" diff --git a/net/dccp/Makefile b/net/dccp/Makefile index f4f8793aaff..0c1c9af2bf7 100644 --- a/net/dccp/Makefile +++ b/net/dccp/Makefile @@ -1,6 +1,7 @@ obj-$(CONFIG_IP_DCCP) += dccp.o dccp_ipv4.o -dccp-y := ccid.o feat.o input.o minisocks.o options.o output.o proto.o timer.o +dccp-y := ccid.o feat.o input.o minisocks.o options.o \ + qpolicy.o output.o proto.o timer.o ackvec.o dccp_ipv4-y := ipv4.o @@ -8,8 +9,6 @@ dccp_ipv4-y := ipv4.o obj-$(subst y,$(CONFIG_IP_DCCP),$(CONFIG_IPV6)) += dccp_ipv6.o dccp_ipv6-y := ipv6.o -dccp-$(CONFIG_IP_DCCP_ACKVEC) += ackvec.o - obj-$(CONFIG_INET_DCCP_DIAG) += dccp_diag.o obj-$(CONFIG_NET_DCCPPROBE) += dccp_probe.o diff --git a/net/dccp/ackvec.c b/net/dccp/ackvec.c index 1e8be246ad1..41819848bdd 100644 --- a/net/dccp/ackvec.c +++ b/net/dccp/ackvec.c @@ -1,445 +1,375 @@ /* * net/dccp/ackvec.c * - * An implementation of the DCCP protocol + * An implementation of Ack Vectors for the DCCP protocol + * Copyright (c) 2007 University of Aberdeen, Scotland, UK * Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@ghostprotocols.net> * * This program is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the * Free Software Foundation; version 2 of the License; */ - -#include "ackvec.h" #include "dccp.h" - -#include <linux/dccp.h> -#include <linux/init.h> -#include <linux/errno.h> #include <linux/kernel.h> -#include <linux/skbuff.h> #include <linux/slab.h> -#include <net/sock.h> - static struct kmem_cache *dccp_ackvec_slab; static struct kmem_cache *dccp_ackvec_record_slab; -static struct dccp_ackvec_record *dccp_ackvec_record_new(void) +struct dccp_ackvec *dccp_ackvec_alloc(const gfp_t priority) { - struct dccp_ackvec_record *avr = - kmem_cache_alloc(dccp_ackvec_record_slab, GFP_ATOMIC); + struct dccp_ackvec *av = kmem_cache_zalloc(dccp_ackvec_slab, priority); - if (avr != NULL) - INIT_LIST_HEAD(&avr->avr_node); - - return avr; + if (av != NULL) { + av->av_buf_head = av->av_buf_tail = DCCPAV_MAX_ACKVEC_LEN - 1; + INIT_LIST_HEAD(&av->av_records); + } + return av; } -static void dccp_ackvec_record_delete(struct dccp_ackvec_record *avr) +static void dccp_ackvec_purge_records(struct dccp_ackvec *av) { - if (unlikely(avr == NULL)) - return; - /* Check if deleting a linked record */ - WARN_ON(!list_empty(&avr->avr_node)); - kmem_cache_free(dccp_ackvec_record_slab, avr); + struct dccp_ackvec_record *cur, *next; + + list_for_each_entry_safe(cur, next, &av->av_records, avr_node) + kmem_cache_free(dccp_ackvec_record_slab, cur); + INIT_LIST_HEAD(&av->av_records); } -static void dccp_ackvec_insert_avr(struct dccp_ackvec *av, - struct dccp_ackvec_record *avr) +void dccp_ackvec_free(struct dccp_ackvec *av) { - /* - * AVRs are sorted by seqno. Since we are sending them in order, we - * just add the AVR at the head of the list. - * -sorbo. - */ - if (!list_empty(&av->av_records)) { - const struct dccp_ackvec_record *head = - list_entry(av->av_records.next, - struct dccp_ackvec_record, - avr_node); - BUG_ON(before48(avr->avr_ack_seqno, head->avr_ack_seqno)); + if (likely(av != NULL)) { + dccp_ackvec_purge_records(av); + kmem_cache_free(dccp_ackvec_slab, av); } - - list_add(&avr->avr_node, &av->av_records); } -int dccp_insert_option_ackvec(struct sock *sk, struct sk_buff *skb) +/** + * dccp_ackvec_update_records - Record information about sent Ack Vectors + * @av: Ack Vector records to update + * @seqno: Sequence number of the packet carrying the Ack Vector just sent + * @nonce_sum: The sum of all buffer nonces contained in the Ack Vector + */ +int dccp_ackvec_update_records(struct dccp_ackvec *av, u64 seqno, u8 nonce_sum) { - struct dccp_sock *dp = dccp_sk(sk); - struct dccp_ackvec *av = dp->dccps_hc_rx_ackvec; - /* Figure out how many options do we need to represent the ackvec */ - const u16 nr_opts = DIV_ROUND_UP(av->av_vec_len, DCCP_MAX_ACKVEC_OPT_LEN); - u16 len = av->av_vec_len + 2 * nr_opts, i; - u32 elapsed_time; - const unsigned char *tail, *from; - unsigned char *to; struct dccp_ackvec_record *avr; - suseconds_t delta; - - if (DCCP_SKB_CB(skb)->dccpd_opt_len + len > DCCP_MAX_OPT_LEN) - return -1; - - delta = ktime_us_delta(ktime_get_real(), av->av_time); - elapsed_time = delta / 10; - if (elapsed_time != 0 && - dccp_insert_option_elapsed_time(sk, skb, elapsed_time)) - return -1; - - avr = dccp_ackvec_record_new(); + avr = kmem_cache_alloc(dccp_ackvec_record_slab, GFP_ATOMIC); if (avr == NULL) - return -1; - - DCCP_SKB_CB(skb)->dccpd_opt_len += len; - - to = skb_push(skb, len); - len = av->av_vec_len; - from = av->av_buf + av->av_buf_head; - tail = av->av_buf + DCCP_MAX_ACKVEC_LEN; - - for (i = 0; i < nr_opts; ++i) { - int copylen = len; - - if (len > DCCP_MAX_ACKVEC_OPT_LEN) - copylen = DCCP_MAX_ACKVEC_OPT_LEN; - - *to++ = DCCPO_ACK_VECTOR_0; - *to++ = copylen + 2; - - /* Check if buf_head wraps */ - if (from + copylen > tail) { - const u16 tailsize = tail - from; - - memcpy(to, from, tailsize); - to += tailsize; - len -= tailsize; - copylen -= tailsize; - from = av->av_buf; - } - - memcpy(to, from, copylen); - from += copylen; - to += copylen; - len -= copylen; - } + return -ENOBUFS; + avr->avr_ack_seqno = seqno; + avr->avr_ack_ptr = av->av_buf_head; + avr->avr_ack_ackno = av->av_buf_ackno; + avr->avr_ack_nonce = nonce_sum; + avr->avr_ack_runlen = dccp_ackvec_runlen(av->av_buf + av->av_buf_head); /* - * From RFC 4340, A.2: - * - * For each acknowledgement it sends, the HC-Receiver will add an - * acknowledgement record. ack_seqno will equal the HC-Receiver - * sequence number it used for the ack packet; ack_ptr will equal - * buf_head; ack_ackno will equal buf_ackno; and ack_nonce will - * equal buf_nonce. + * When the buffer overflows, we keep no more than one record. This is + * the simplest way of disambiguating sender-Acks dating from before the + * overflow from sender-Acks which refer to after the overflow; a simple + * solution is preferable here since we are handling an exception. */ - avr->avr_ack_seqno = DCCP_SKB_CB(skb)->dccpd_seq; - avr->avr_ack_ptr = av->av_buf_head; - avr->avr_ack_ackno = av->av_buf_ackno; - avr->avr_ack_nonce = av->av_buf_nonce; - avr->avr_sent_len = av->av_vec_len; - - dccp_ackvec_insert_avr(av, avr); + if (av->av_overflow) + dccp_ackvec_purge_records(av); + /* + * Since GSS is incremented for each packet, the list is automatically + * arranged in descending order of @ack_seqno. + */ + list_add(&avr->avr_node, &av->av_records); - dccp_pr_debug("%s ACK Vector 0, len=%d, ack_seqno=%llu, " - "ack_ackno=%llu\n", - dccp_role(sk), avr->avr_sent_len, + dccp_pr_debug("Added Vector, ack_seqno=%llu, ack_ackno=%llu (rl=%u)\n", (unsigned long long)avr->avr_ack_seqno, - (unsigned long long)avr->avr_ack_ackno); + (unsigned long long)avr->avr_ack_ackno, + avr->avr_ack_runlen); return 0; } -struct dccp_ackvec *dccp_ackvec_alloc(const gfp_t priority) +static struct dccp_ackvec_record *dccp_ackvec_lookup(struct list_head *av_list, + const u64 ackno) { - struct dccp_ackvec *av = kmem_cache_alloc(dccp_ackvec_slab, priority); - - if (av != NULL) { - av->av_buf_head = DCCP_MAX_ACKVEC_LEN - 1; - av->av_buf_ackno = UINT48_MAX + 1; - av->av_buf_nonce = 0; - av->av_time = ktime_set(0, 0); - av->av_vec_len = 0; - INIT_LIST_HEAD(&av->av_records); + struct dccp_ackvec_record *avr; + /* + * Exploit that records are inserted in descending order of sequence + * number, start with the oldest record first. If @ackno is `before' + * the earliest ack_ackno, the packet is too old to be considered. + */ + list_for_each_entry_reverse(avr, av_list, avr_node) { + if (avr->avr_ack_seqno == ackno) + return avr; + if (before48(ackno, avr->avr_ack_seqno)) + break; } - - return av; + return NULL; } -void dccp_ackvec_free(struct dccp_ackvec *av) +/* + * Buffer index and length computation using modulo-buffersize arithmetic. + * Note that, as pointers move from right to left, head is `before' tail. + */ +static inline u16 __ackvec_idx_add(const u16 a, const u16 b) { - if (unlikely(av == NULL)) - return; - - if (!list_empty(&av->av_records)) { - struct dccp_ackvec_record *avr, *next; - - list_for_each_entry_safe(avr, next, &av->av_records, avr_node) { - list_del_init(&avr->avr_node); - dccp_ackvec_record_delete(avr); - } - } - - kmem_cache_free(dccp_ackvec_slab, av); + return (a + b) % DCCPAV_MAX_ACKVEC_LEN; } -static inline u8 dccp_ackvec_state(const struct dccp_ackvec *av, - const u32 index) +static inline u16 __ackvec_idx_sub(const u16 a, const u16 b) { - return av->av_buf[index] & DCCP_ACKVEC_STATE_MASK; + return __ackvec_idx_add(a, DCCPAV_MAX_ACKVEC_LEN - b); } -static inline u8 dccp_ackvec_len(const struct dccp_ackvec *av, - const u32 index) +u16 dccp_ackvec_buflen(const struct dccp_ackvec *av) { - return av->av_buf[index] & DCCP_ACKVEC_LEN_MASK; + if (unlikely(av->av_overflow)) + return DCCPAV_MAX_ACKVEC_LEN; + return __ackvec_idx_sub(av->av_buf_tail, av->av_buf_head); } -/* - * If several packets are missing, the HC-Receiver may prefer to enter multiple - * bytes with run length 0, rather than a single byte with a larger run length; - * this simplifies table updates if one of the missing packets arrives. +/** + * dccp_ackvec_update_old - Update previous state as per RFC 4340, 11.4.1 + * @av: non-empty buffer to update + * @distance: negative or zero distance of @seqno from buf_ackno downward + * @seqno: the (old) sequence number whose record is to be updated + * @state: state in which packet carrying @seqno was received */ -static inline int dccp_ackvec_set_buf_head_state(struct dccp_ackvec *av, - const unsigned int packets, - const unsigned char state) +static void dccp_ackvec_update_old(struct dccp_ackvec *av, s64 distance, + u64 seqno, enum dccp_ackvec_states state) { - unsigned int gap; - long new_head; + u16 ptr = av->av_buf_head; - if (av->av_vec_len + packets > DCCP_MAX_ACKVEC_LEN) - return -ENOBUFS; + BUG_ON(distance > 0); + if (unlikely(dccp_ackvec_is_empty(av))) + return; - gap = packets - 1; - new_head = av->av_buf_head - packets; + do { + u8 runlen = dccp_ackvec_runlen(av->av_buf + ptr); - if (new_head < 0) { - if (gap > 0) { - memset(av->av_buf, DCCP_ACKVEC_STATE_NOT_RECEIVED, - gap + new_head + 1); - gap = -new_head; + if (distance + runlen >= 0) { + /* + * Only update the state if packet has not been received + * yet. This is OK as per the second table in RFC 4340, + * 11.4.1; i.e. here we are using the following table: + * RECEIVED + * 0 1 3 + * S +---+---+---+ + * T 0 | 0 | 0 | 0 | + * O +---+---+---+ + * R 1 | 1 | 1 | 1 | + * E +---+---+---+ + * D 3 | 0 | 1 | 3 | + * +---+---+---+ + * The "Not Received" state was set by reserve_seats(). + */ + if (av->av_buf[ptr] == DCCPAV_NOT_RECEIVED) + av->av_buf[ptr] = state; + else + dccp_pr_debug("Not changing %llu state to %u\n", + (unsigned long long)seqno, state); + break; } - new_head += DCCP_MAX_ACKVEC_LEN; - } - av->av_buf_head = new_head; + distance += runlen + 1; + ptr = __ackvec_idx_add(ptr, 1); - if (gap > 0) - memset(av->av_buf + av->av_buf_head + 1, - DCCP_ACKVEC_STATE_NOT_RECEIVED, gap); + } while (ptr != av->av_buf_tail); +} - av->av_buf[av->av_buf_head] = state; - av->av_vec_len += packets; - return 0; +/* Mark @num entries after buf_head as "Not yet received". */ +static void dccp_ackvec_reserve_seats(struct dccp_ackvec *av, u16 num) +{ + u16 start = __ackvec_idx_add(av->av_buf_head, 1), + len = DCCPAV_MAX_ACKVEC_LEN - start; + + /* check for buffer wrap-around */ + if (num > len) { + memset(av->av_buf + start, DCCPAV_NOT_RECEIVED, len); + start = 0; + num -= len; + } + if (num) + memset(av->av_buf + start, DCCPAV_NOT_RECEIVED, num); } -/* - * Implements the RFC 4340, Appendix A +/** + * dccp_ackvec_add_new - Record one or more new entries in Ack Vector buffer + * @av: container of buffer to update (can be empty or non-empty) + * @num_packets: number of packets to register (must be >= 1) + * @seqno: sequence number of the first packet in @num_packets + * @state: state in which packet carrying @seqno was received */ -int dccp_ackvec_add(struct dccp_ackvec *av, const struct sock *sk, - const u64 ackno, const u8 state) +static void dccp_ackvec_add_new(struct dccp_ackvec *av, u32 num_packets, + u64 seqno, enum dccp_ackvec_states state) { - /* - * Check at the right places if the buffer is full, if it is, tell the - * caller to sta |